diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml index 31423357e3f3..05f5b2711e24 100644 --- a/.github/workflows/build-sphinx.yml +++ b/.github/workflows/build-sphinx.yml @@ -26,7 +26,7 @@ jobs: name: Build and Deploy Docs runs-on: ubuntu-22.04 - timeout-minutes: 60 + timeout-minutes: 90 permissions: # Needed to cancel any previous runs that are not completed for a given workflow diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml index bead55d5f495..3ad8ba1ee84e 100644 --- a/.github/workflows/check-onemath.yaml +++ b/.github/workflows/check-onemath.yaml @@ -74,7 +74,7 @@ jobs: os: [ubuntu-22.04] # windows-2022 - no DFT support for Windows in oneMKL runs-on: ${{ matrix.os }} - timeout-minutes: 60 + timeout-minutes: 120 defaults: run: @@ -133,6 +133,14 @@ jobs: if: env.rerun-tests-on-failure != 'true' run: | python -m pytest -ra --pyargs dpnp.tests + env: + SKIP_TENSOR_TESTS: 1 + SYCL_CACHE_PERSISTENT: 1 + + - name: Run tensor tests + if: env.rerun-tests-on-failure != 'true' + run: | + python -m pytest -ra --pyargs dpnp.tests.tensor env: SYCL_CACHE_PERSISTENT: 1 @@ -150,6 +158,24 @@ jobs: mamba activate ${{ env.test-env-name }} python -m pytest -ra --pyargs dpnp.tests + env: + SKIP_TENSOR_TESTS: 1 + SYCL_CACHE_PERSISTENT: 1 + + - name: ReRun tensor tests on Linux + if: env.rerun-tests-on-failure == 'true' + id: run_tensor_tests + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2 + with: + timeout_minutes: ${{ env.rerun-tests-timeout }} + max_attempts: ${{ env.rerun-tests-max-attempts }} + retry_on: any + command: | + . $CONDA/etc/profile.d/conda.sh + . $CONDA/etc/profile.d/mamba.sh + mamba activate ${{ env.test-env-name }} + + python -m pytest -ra --pyargs dpnp.tests.tensor env: SYCL_CACHE_PERSISTENT: 1 @@ -239,6 +265,14 @@ jobs: if: env.rerun-tests-on-failure != 'true' run: | python -m pytest -ra --pyargs dpnp.tests + env: + SKIP_TENSOR_TESTS: 1 + SYCL_CACHE_PERSISTENT: 1 + + - name: Run tensor tests + if: env.rerun-tests-on-failure != 'true' + run: | + python -m pytest -ra --pyargs dpnp.tests.tensor env: SYCL_CACHE_PERSISTENT: 1 @@ -256,5 +290,23 @@ jobs: mamba activate ${{ env.test-env-name }} python -m pytest -ra --pyargs dpnp.tests + env: + SKIP_TENSOR_TESTS: 1 + SYCL_CACHE_PERSISTENT: 1 + + - name: ReRun tensor tests on Linux + if: env.rerun-tests-on-failure == 'true' + id: run_tensor_tests_branch + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2 + with: + timeout_minutes: ${{ env.rerun-tests-timeout }} + max_attempts: ${{ env.rerun-tests-max-attempts }} + retry_on: any + command: | + . $CONDA/etc/profile.d/conda.sh + . $CONDA/etc/profile.d/mamba.sh + mamba activate ${{ env.test-env-name }} + + python -m pytest -ra --pyargs dpnp.tests.tensor env: SYCL_CACHE_PERSISTENT: 1 diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 886204654a98..afd34ee00543 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -37,7 +37,7 @@ jobs: actions: write runs-on: ${{ matrix.os }} - timeout-minutes: 60 + timeout-minutes: 90 defaults: run: @@ -220,6 +220,7 @@ jobs: - name: Run tests if: env.rerun-tests-on-failure != 'true' run: | + export SKIP_TENSOR_TESTS=1 if [[ "${{ matrix.python }}" == "${{ env.python-ver-test-all-dtypes }}" ]]; then export DPNP_TEST_ALL_INT_TYPES=1 python -m pytest -ra --pyargs ${{ env.package-name }}.tests @@ -239,6 +240,7 @@ jobs: . $CONDA/etc/profile.d/conda.sh . 
$CONDA/etc/profile.d/mamba.sh mamba activate ${{ env.test-env-name }} + export SKIP_TENSOR_TESTS=1 if [[ "${{ matrix.python }}" == "${{ env.python-ver-test-all-dtypes }}" ]]; then export DPNP_TEST_ALL_INT_TYPES=1 @@ -247,6 +249,26 @@ jobs: python -m pytest -n auto -ra --pyargs ${{ env.package-name }}.tests fi + - name: Run tensor tests + if: env.rerun-tests-on-failure != 'true' + run: | + python -m pytest -n auto -ra --pyargs dpnp.tests.tensor + + - name: Run tensor tests + if: env.rerun-tests-on-failure == 'true' + id: run_tests_tensor_linux + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2 + with: + timeout_minutes: ${{ env.rerun-tests-timeout }} + max_attempts: ${{ env.rerun-tests-max-attempts }} + retry_on: any + command: | + . $CONDA/etc/profile.d/conda.sh + . $CONDA/etc/profile.d/mamba.sh + mamba activate ${{ env.test-env-name }} + + python -m pytest -n auto -ra --pyargs dpnp.tests.tensor + test_windows: name: Test @@ -382,6 +404,7 @@ jobs: if: env.rerun-tests-on-failure != 'true' shell: pwsh run: | + $env:SKIP_TENSOR_TESTS=1 if (${{ matrix.python }} -eq ${{ env.python-ver-test-all-dtypes }}) { $env:DPNP_TEST_ALL_INT_TYPES=1 python -m pytest -ra --pyargs ${{ env.package-name }}.tests @@ -399,6 +422,7 @@ jobs: retry_on: any shell: pwsh command: | + $env:SKIP_TENSOR_TESTS=1 if ( ${{ matrix.python }} -eq ${{ env.python-ver-test-all-dtypes }} ) { $env:DPNP_TEST_ALL_INT_TYPES=1 python -m pytest -ra --pyargs ${{ env.package-name }}.tests @@ -406,6 +430,24 @@ jobs: python -m pytest -n auto -ra --pyargs ${{ env.package-name }}.tests } + - name: Run tensor tests + if: env.rerun-tests-on-failure != 'true' + shell: pwsh + run: | + python -m pytest -n auto -ra --pyargs dpnp.tests.tensor + + - name: Run tensor tests + if: env.rerun-tests-on-failure == 'true' + id: run_tests_tensor_win + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2 + with: + timeout_minutes: ${{ env.rerun-tests-timeout }} + max_attempts: ${{ env.rerun-tests-max-attempts }} + retry_on: any + shell: pwsh + command: | + python -m pytest -n auto -ra --pyargs dpnp.tests.tensor + upload: name: Upload diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml index 5fd211e55a81..3d5d34531adf 100644 --- a/.github/workflows/generate_coverage.yaml +++ b/.github/workflows/generate_coverage.yaml @@ -11,7 +11,7 @@ jobs: name: Generate coverage and push to Coveralls.io runs-on: ubuntu-latest - timeout-minutes: 120 + timeout-minutes: 150 permissions: # Needed to cancel any previous runs that are not completed for a given workflow @@ -122,7 +122,7 @@ jobs: uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0 with: shell: bash - timeout_minutes: 60 + timeout_minutes: 120 max_attempts: 5 retry_on: error command: | @@ -130,6 +130,7 @@ jobs: conda activate coverage [ -f /opt/intel/oneapi/setvars.sh ] && source /opt/intel/oneapi/setvars.sh git clean -fxd + export SKIP_TENSOR_TESTS=1 python scripts/gen_coverage.py - name: Total number of coverage attempts diff --git a/.gitignore b/.gitignore index 5d2725d3186f..f66bfbb3fdd8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ dpnp_pytest.* example3 *dpnp_backend* +dpnp/include/dpnp/tensor/*.h dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ diff --git a/CHANGELOG.md b/CHANGELOG.md index f8aaae542ec5..bf659a351a57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.20.0] - MM/DD/2026 +This release 
introduces a major architectural change: the Array API-compliant tensor implementation has been migrated from `dpctl.tensor` into `dpnp.tensor`, simplifying maintenance, reducing cross-project dependencies, and allowing the tensor implementation to evolve within `dpnp`. This release changes the license from `BSD-2-Clause` to `BSD-3-Clause`. This release achieves `dpnp` compatibility with Python 3.14 and enables distributing `dpnp` packages with the latest Python version. Also, that release drops support for Python 3.9, making Python 3.10 the minimum required version. @@ -28,6 +29,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595) * Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787) * Added support for ndarray subclassing via `dpnp.ndarray.view` method with `type` parameter [#2815](https://github.com/IntelPython/dpnp/issues/2815) +* Migrated tensor implementation from `dpctl.tensor` into `dpnp.tensor`, making `dpnp` the primary owner of the Array API-compliant tensor layer [#2856](https://github.com/IntelPython/dpnp/pull/2856) ### Changed @@ -84,6 +86,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814) * Fixed test tolerance issues for float16 intermediate precision that became visible when testing against conda-forge's NumPy [#2828](https://github.com/IntelPython/dpnp/pull/2828) * Ensured device aware dtype handling in `dpnp.identity` and `dpnp.gradient` [#2835](https://github.com/IntelPython/dpnp/pull/2835) +* Fixed `dpnp.tensor.round` to use device-aware output dtype for boolean input [#2851](https://github.com/IntelPython/dpnp/pull/2851) ### Security diff --git a/CMakeLists.txt b/CMakeLists.txt index 129bf1d87c25..b5c1068c1677 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,12 +37,23 @@ project( ) option(DPNP_GENERATE_COVERAGE "Enable build DPNP with coverage instrumentation" OFF) +option( + DPNP_TENSOR_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS + "Build dpnp tensor pybind11 offloading extensions with coverage instrumentation" + OFF +) option(DPNP_BACKEND_TESTS "Enable building of DPNP backend test suite" OFF) option( DPNP_WITH_REDIST "Build DPNP assuming DPC++ redistributable is installed into Python prefix" OFF ) +option( + DPNP_TENSOR_OFFLOAD_COMPRESS + "Build dpnp tensor using offload section compression feature of DPC++ to reduce \ +size of shared object with offloading sections" + OFF +) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -106,7 +117,6 @@ find_package(Cython REQUIRED) find_package(Dpctl REQUIRED) message(STATUS "Dpctl_INCLUDE_DIR=" ${Dpctl_INCLUDE_DIR}) -message(STATUS "Dpctl_TENSOR_INCLUDE_DIR=" ${Dpctl_TENSOR_INCLUDE_DIR}) option(DPNP_USE_ONEMATH "Build DPNP with oneMath" OFF) set(DPNP_TARGET_CUDA diff --git a/doc/conf.py b/doc/conf.py index 469e6d5f5353..57119eab5396 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -6,6 +6,7 @@ # http://www.sphinx-doc.org/en/master/config from datetime import datetime +from urllib.parse import urljoin from sphinx.ext.autodoc import FunctionDocumenter from sphinx.ext.napoleon import NumpyDocstring, docstring @@ -231,6 +232,9 @@ def _can_document_member(member, *args, **kwargs): autosummary_generate = True
+_DPCTL_021_BASE = "https://intelpython.github.io/dpctl/0.21.1/" +_DPCTL_021_INV = urljoin(_DPCTL_021_BASE, "objects.inv") + intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), "numpy": ("https://numpy.org/doc/stable/", None), @@ -302,3 +306,65 @@ def _parse_returns_section_patched(self, section: str) -> list[str]: NumpyDocstring._parse_returns_section = _parse_returns_section_patched + + +# TODO: Remove once dpnp.tensor docs are generated in dpnp +def _load_dpctl_tensor_inventory(app): + """Load dpctl 0.21.1 inventory for dpnp.tensor fallback only.""" + from sphinx.ext.intersphinx import fetch_inventory + from sphinx.util import logging + + logger = logging.getLogger(__name__) + + try: + inv = fetch_inventory(app, _DPCTL_021_BASE, _DPCTL_021_INV) + except Exception as exc: + logger.warning( + "Failed to load dpctl 0.21.1 inventory from %s: %s", + _DPCTL_021_INV, + exc, + ) + inv = {} + + app.builder.env._dpctl_tensor_021_inventory = inv + + +# TODO: Remove once dpnp.tensor docs are generated in dpnp +def _resolve_dpnp_tensor_refs(app, env, node, contnode): + """Resolve dpnp.tensor.* references to dpctl 0.21.1 documentation. + + This temporary workaround is needed because dpnp.tensor documentation + is not generated yet, while the corresponding API is still documented + in dpctl 0.21.1. + """ + from docutils import nodes as docutils_nodes + + target = node.get("reftarget", "") + if not target.startswith("dpnp.tensor"): + return None + + dpctl_target = target.replace("dpnp.tensor", "dpctl.tensor", 1) + dpctl_tensor_inv = getattr(env, "_dpctl_tensor_021_inventory", {}) + + for _objtype, objects in dpctl_tensor_inv.items(): + if dpctl_target not in objects: + continue + + item = objects[dpctl_target] + location = item.uri + if location.endswith("$"): + location = location[:-1] + dpctl_target + + refuri = urljoin(_DPCTL_021_BASE, location) + newnode = docutils_nodes.reference( + "", "", internal=False, refuri=refuri + ) + newnode += contnode.deepcopy() + return newnode + + return None + + +def setup(app): + app.connect("builder-inited", _load_dpctl_tensor_inventory, priority=400) + app.connect("missing-reference", _resolve_dpnp_tensor_refs, priority=400) diff --git a/doc/index.rst b/doc/index.rst index 38c12489636b..847680fc11d9 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -13,6 +13,7 @@ Data Parallel Extension for NumPy* overview quick_start_guide reference/index + tensor .. toctree:: :maxdepth: 1 diff --git a/doc/reference/exceptions.rst b/doc/reference/exceptions.rst index 8f459b9f3aaa..69980ac8d8c2 100644 --- a/doc/reference/exceptions.rst +++ b/doc/reference/exceptions.rst @@ -20,7 +20,7 @@ Exceptions .. exception:: DLPackCreationError Given when constructing DLPack capsule from either :class:`dpnp.ndarray` or - :class:`dpctl.tensor.usm_ndarray` based on a USM allocation + :class:`dpnp.tensor.usm_ndarray` based on a USM allocation on a partitioned SYCL device. .. rubric:: Examples diff --git a/doc/tensor.rst b/doc/tensor.rst new file mode 100644 index 000000000000..22a1812f38a3 --- /dev/null +++ b/doc/tensor.rst @@ -0,0 +1,70 @@ +.. _tensor: + +Tensor (``dpnp.tensor``) +======================== + +``dpnp.tensor`` provides a reference implementation of the +`Python Array API <https://data-apis.org/array-api/latest/>`_ specification. +The implementation uses data-parallel algorithms suitable for execution on +accelerators, such as GPUs. + +It also provides the underlying Array API-compliant implementation +used by ``dpnp``. 
+ +``dpnp.tensor`` is written in C++ and +`SYCL <https://www.khronos.org/sycl/>`_, +using oneAPI extensions implemented in the +`Intel(R) oneAPI DPC++ compiler <https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html>`_. + +Design and Motivation +--------------------- + +The tensor implementation was originally developed as a standalone project and +later integrated into the `dpctl <https://github.com/IntelPython/dpctl>`_ +library as ``dpctl.tensor``. It has since been migrated into ``dpnp``, +making ``dpnp`` the primary owner and development location of the tensor implementation. + +This change simplifies maintenance, reduces cross-project +dependencies, and enables independent development and release cycles. + +Relationship to ``dpnp.ndarray`` +-------------------------------- + +:class:`dpnp.ndarray` is a high-level array object built on top of +``dpnp.tensor.usm_ndarray``, storing array data in Unified Shared Memory +(USM) allocated on a SYCL device. Most users interact with +:class:`dpnp.ndarray` directly; ``dpnp.tensor.usm_ndarray`` may appear in error +messages or type signatures when working with device placement or +interoperability. + +Relationship to ``dpctl`` +------------------------- + +The migration of ``dpctl.tensor`` into ``dpnp.tensor`` does not replace +`dpctl <https://github.com/IntelPython/dpctl>`_ itself. +``dpctl`` remains responsible for device and queue management +(:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`) as well as USM memory +allocation. ``dpnp`` builds on top of these capabilities. + +Example +------- + +.. code-block:: python + + import dpnp + import dpnp.tensor as dpt + + # Create a tensor array on the default device + x = dpt.asarray([1.0, 2.0, 3.0]) + + # dpnp.ndarray wraps the underlying usm_ndarray + a = dpnp.asarray([1.0, 2.0, 3.0]) + assert isinstance(a.get_array(), dpt.usm_ndarray) + +.. note:: + + The ``dpnp.tensor`` API documentation will be added in a future release. + + The current implementation remains compatible with the original + ``dpctl.tensor`` API. For the complete API reference, see the + `dpctl 0.21.1 tensor documentation <https://intelpython.github.io/dpctl/0.21.1/>`_. 
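Because ``dpctl`` continues to own device and queue management, a :class:`dpctl.SyclQueue` is expected to work with ``dpnp.tensor`` creation functions just as it did with ``dpctl.tensor``. The snippet below is a minimal sketch of that interplay; it assumes the ``usm_type`` and ``sycl_queue`` keywords keep the behavior documented for ``dpctl.tensor``.

.. code-block:: python

    import dpctl
    import dpnp.tensor as dpt

    # dpctl still owns device and queue management
    q = dpctl.SyclQueue()  # queue on the default-selected device

    # allocate the tensor as shared USM bound to that queue
    x = dpt.zeros((3, 3), usm_type="shared", sycl_queue=q)

    assert x.usm_type == "shared"
    print(x.device)  # the SYCL device the tensor lives on
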
diff --git a/dpnp/CMakeLists.txt b/dpnp/CMakeLists.txt index 6850b799735c..d7acf368bcd0 100644 --- a/dpnp/CMakeLists.txt +++ b/dpnp/CMakeLists.txt @@ -86,11 +86,96 @@ function(build_dpnp_cython_ext _trgt _src _dest) install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) endfunction() +function(build_dpnp_tensor_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPNP_TENSOR "${options}" "RELATIVE_PATH" "" ${ARGN}) + add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) + set(_cythonize_trgt "${_trgt}_cythonize_pyx") + python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if(BUILD_DPNP_TENSOR_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) + if(DPNP_TENSOR_OFFLOAD_COMPRESS) + target_link_options(${_trgt} PRIVATE --offload-compress) + endif() + if(_dpnp_sycl_targets) + # make fat binary + target_compile_options( + ${_trgt} + PRIVATE ${_dpnp_sycl_target_compile_options} + ) + target_link_options(${_trgt} PRIVATE ${_dpnp_sycl_target_link_options}) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE Python::NumPy) + if(DPNP_GENERATE_COVERAGE) + target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) + if(BUILD_DPNP_TENSOR_SYCL) + target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) + endif() + endif() + # Dpctl + target_include_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR}) + target_link_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR}/..) + target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) + set(_linker_options "LINKER:${DPNP_LDFLAGS}") + target_link_options(${_trgt} PRIVATE ${_linker_options}) + get_filename_component(_name_wle ${_generated_src} NAME_WLE) + get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) + set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") + set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") + + # TODO: create separate folder inside build folder that contains only + # headers related to this target and appropriate folder structure to + # eliminate shadow dependencies + # Go up two levels to build root for "dpnp/tensor/_usmarray.h" resolution + get_filename_component(_parent_dir ${_generated_src_dir} DIRECTORY) + get_filename_component(_build_root ${_parent_dir} DIRECTORY) + # TODO: do not set directory if we did not generate header + target_include_directories(${_trgt} INTERFACE ${_build_root}) + set(_rpath_value "$ORIGIN") + if(BUILD_DPNP_TENSOR_RELATIVE_PATH) + set(_rpath_value "${_rpath_value}/${BUILD_DPNP_TENSOR_RELATIVE_PATH}") + endif() + if(DPNP_WITH_REDIST) + set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") + endif() + set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) + + install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) + install( + FILES ${_generated_api_h} + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpnp/include/${_dest} + OPTIONAL + ) + install( + FILES ${_generated_public_h} + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpnp/include/${_dest} + OPTIONAL + ) + if(DPNP_GENERATE_COVERAGE) + get_filename_component(_original_src_dir ${_src} DIRECTORY) + file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) + install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) + endif() + + # Create target with headers only, because python is managing all the + # library imports at runtime + 
set(_trgt_headers ${_trgt}_headers) + add_library(${_trgt_headers} INTERFACE) + add_dependencies(${_trgt_headers} ${_trgt}) + get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) + target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) +endfunction() + function(build_dpnp_cython_ext_with_backend _trgt _src _dest) build_dpnp_cython_ext(${_trgt} ${_src} ${_dest}) target_link_libraries(${_trgt} PRIVATE dpnp_backend_library) endfunction() +add_subdirectory(tensor) + add_subdirectory(backend) add_subdirectory(backend/extensions/blas) add_subdirectory(backend/extensions/fft) diff --git a/dpnp/__init__.py b/dpnp/__init__.py index 02420107972f..d2ea158d4d44 100644 --- a/dpnp/__init__.py +++ b/dpnp/__init__.py @@ -28,7 +28,6 @@ import os import sys -import warnings mypath = os.path.dirname(os.path.realpath(__file__)) @@ -61,10 +60,7 @@ [os.getenv("PATH", ""), dll_path] ) -# Borrowed from DPCTL -with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - from dpctl.tensor import __array_api_version__, DLDeviceType +from .tensor import __array_api_version__, DLDeviceType from .dpnp_array import dpnp_array as ndarray from .dpnp_array_api_info import __array_namespace_info__ diff --git a/dpnp/__main__.py b/dpnp/__main__.py new file mode 100644 index 000000000000..1c9c652109ee --- /dev/null +++ b/dpnp/__main__.py @@ -0,0 +1,78 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import argparse +import importlib.util +import os +import os.path +import sys + + +def _dpnp_dir() -> str: + dpnp_dir = importlib.util.find_spec("dpnp").submodule_search_locations[0] + abs_dpnp_dir = os.path.abspath(dpnp_dir) + return abs_dpnp_dir + + +def get_tensor_include_dir() -> str: + """Returns the path to the dpnp libtensor include directory""" + dpnp_dir = _dpnp_dir() + libtensor_dir = os.path.join(dpnp_dir, "tensor", "libtensor", "include") + return libtensor_dir + + +def print_tensor_include_flags() -> None: + """Prints include flags for dpnp tensor library""" + libtensor_dir = get_tensor_include_dir() + print("-I " + libtensor_dir) + + +def main() -> None: + """Main entry-point.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--tensor-includes", + action="store_true", + help="Include flags for dpnp libtensor headers.", + ) + parser.add_argument( + "--tensor-include-dir", + action="store_true", + help="Path to dpnp libtensor include directory.", + ) + args = parser.parse_args() + if not sys.argv[1:]: + parser.print_help() + if args.tensor_includes: + print_tensor_include_flags() + if args.tensor_include_dir: + print(get_tensor_include_dir()) + + +if __name__ == "__main__": + main() diff --git a/dpnp/backend/CMakeLists.txt b/dpnp/backend/CMakeLists.txt index ddca557a08f4..433ab298d476 100644 --- a/dpnp/backend/CMakeLists.txt +++ b/dpnp/backend/CMakeLists.txt @@ -89,7 +89,6 @@ target_compile_definitions(${_trgt} PUBLIC PSTL_USE_PARALLEL_POLICIES=0) target_compile_definitions(${_trgt} PUBLIC ONEDPL_USE_PREDEFINED_POLICIES=0) target_include_directories(${_trgt} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${_trgt} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) target_link_directories(${_trgt} PUBLIC "${Dpctl_INCLUDE_DIR}/..") target_link_libraries(${_trgt} PUBLIC DPCTLSyclInterface) diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 5960dfcd8028..b4013d82eb40 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -39,6 +39,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -65,14 +68,20 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/blas/dot_common.hpp b/dpnp/backend/extensions/blas/dot_common.hpp index 383804ff1718..d9c3ae7f1c87 100644 --- a/dpnp/backend/extensions/blas/dot_common.hpp +++ b/dpnp/backend/extensions/blas/dot_common.hpp @@ -29,6 +29,7 @@ #pragma once #include + #include // dpctl tensor headers diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp index 
997d515f98a0..59a3d911d885 100644 --- a/dpnp/backend/extensions/blas/gemm.hpp +++ b/dpnp/backend/extensions/blas/gemm.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { diff --git a/dpnp/backend/extensions/blas/gemv.hpp b/dpnp/backend/extensions/blas/gemv.hpp index afe0c6387aa9..6da71ed0964f 100644 --- a/dpnp/backend/extensions/blas/gemv.hpp +++ b/dpnp/backend/extensions/blas/gemv.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { diff --git a/dpnp/backend/extensions/blas/syrk.hpp b/dpnp/backend/extensions/blas/syrk.hpp index 580239b28008..f6cec189489a 100644 --- a/dpnp/backend/extensions/blas/syrk.hpp +++ b/dpnp/backend/extensions/blas/syrk.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { diff --git a/dpnp/backend/extensions/common/ext/common.hpp b/dpnp/backend/extensions/common/ext/common.hpp index f0ce1722bfb1..3c82fb10ec16 100644 --- a/dpnp/backend/extensions/common/ext/common.hpp +++ b/dpnp/backend/extensions/common/ext/common.hpp @@ -29,8 +29,10 @@ #pragma once #include + #include #include + #include // dpctl tensor headers diff --git a/dpnp/backend/extensions/common/ext/details/common_internal.hpp b/dpnp/backend/extensions/common/ext/details/common_internal.hpp index 31d9671a0a43..8db72ce32318 100644 --- a/dpnp/backend/extensions/common/ext/details/common_internal.hpp +++ b/dpnp/backend/extensions/common/ext/details/common_internal.hpp @@ -30,9 +30,11 @@ #include +#include +#include + #include "ext/common.hpp" #include "utils/type_dispatch.hpp" -#include namespace dpctl_td_ns = dpctl::tensor::type_dispatch; diff --git a/dpnp/backend/extensions/common/ext/validation_utils.hpp b/dpnp/backend/extensions/common/ext/validation_utils.hpp index d41db8d5ca5a..03e0718d4450 100644 --- a/dpnp/backend/extensions/common/ext/validation_utils.hpp +++ b/dpnp/backend/extensions/common/ext/validation_utils.hpp @@ -32,7 +32,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" namespace ext::validation { diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp index 6a29c9a33c5a..affe2fb5dc49 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp @@ -30,16 +30,17 @@ #include #include +#include #include #include #include -#include - -#include "dpctl4pybind11.hpp" #include #include -#include + +#include + +#include "dpnp4pybind11.hpp" #include "elementwise_functions_type_utils.hpp" #include "simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp index 62f7584a3e0c..7300f938eabb 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp @@ -26,12 +26,13 @@ // THE POSSIBILITY OF SUCH DAMAGE. 
//***************************************************************************** -#include "dpctl4pybind11.hpp" - #include #include + #include +#include "dpnp4pybind11.hpp" + #include "elementwise_functions_type_utils.hpp" // dpctl tensor headers diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp index 1bb6fedd7027..58fe43c01589 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp @@ -28,10 +28,10 @@ #pragma once -#include "dpctl4pybind11.hpp" #include #include -#include + +#include "dpnp4pybind11.hpp" // dpctl tensor headers #include "utils/type_dispatch.hpp" diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index f8f63dd7fd3b..9c452d94bd23 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -33,6 +33,9 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/fft_py.cpp) pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -57,11 +60,21 @@ set_target_properties( PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON ) +target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include +) + # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/fft/in_place.hpp b/dpnp/backend/extensions/fft/in_place.hpp index 7eed11565b9e..bc35201b9b6e 100644 --- a/dpnp/backend/extensions/fft/in_place.hpp +++ b/dpnp/backend/extensions/fft/in_place.hpp @@ -28,10 +28,13 @@ #pragma once +#include +#include + #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::fft { diff --git a/dpnp/backend/extensions/fft/in_place.tpp b/dpnp/backend/extensions/fft/in_place.tpp index 4bc166b0e7ae..ace535284de6 100644 --- a/dpnp/backend/extensions/fft/in_place.tpp +++ b/dpnp/backend/extensions/fft/in_place.tpp @@ -27,15 +27,23 @@ //***************************************************************************** #pragma once + #include +#include +#include +#include + +#include #include #include -#include +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fft_utils.hpp" +#include "in_place.hpp" + // dpctl tensor headers #include "utils/output_validation.hpp" diff --git a/dpnp/backend/extensions/fft/out_of_place.hpp b/dpnp/backend/extensions/fft/out_of_place.hpp index 811a2bd6d1c4..55ca9383baaf 100644 --- a/dpnp/backend/extensions/fft/out_of_place.hpp +++ b/dpnp/backend/extensions/fft/out_of_place.hpp @@ -28,10 +28,13 @@ #pragma once +#include +#include + #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::fft { diff --git a/dpnp/backend/extensions/fft/out_of_place.tpp b/dpnp/backend/extensions/fft/out_of_place.tpp index ed5cd37df7f1..aada49c16bda 
100644 --- a/dpnp/backend/extensions/fft/out_of_place.tpp +++ b/dpnp/backend/extensions/fft/out_of_place.tpp @@ -27,15 +27,25 @@ //***************************************************************************** #pragma once + +#include +#include #include +#include +#include +#include #include #include -#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fft_utils.hpp" +#include "out_of_place.hpp" + // dpctl tensor headers #include "utils/memory_overlap.hpp" #include "utils/output_validation.hpp" diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index e1bc34c9ae8b..ce800a87124c 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -36,6 +36,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -62,14 +65,21 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp index 3b2df73f46ef..fafcbe1f2495 100644 --- a/dpnp/backend/extensions/indexing/choose.cpp +++ b/dpnp/backend/extensions/indexing/choose.cpp @@ -39,10 +39,11 @@ #include -#include "dpctl4pybind11.hpp" #include #include +#include "dpnp4pybind11.hpp" + #include "ext/common.hpp" #include "kernels/indexing/choose.hpp" diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 6dee8abebeca..6c898df05236 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -56,6 +56,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -82,14 +85,20 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp 
b/dpnp/backend/extensions/lapack/geqrf.hpp index 522006ace8ab..7be1fee971cf 100644 --- a/dpnp/backend/extensions/lapack/geqrf.hpp +++ b/dpnp/backend/extensions/lapack/geqrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp index d4198efae62e..a86039c9b72e 100644 --- a/dpnp/backend/extensions/lapack/gesv.hpp +++ b/dpnp/backend/extensions/lapack/gesv.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/gesvd.hpp b/dpnp/backend/extensions/lapack/gesvd.hpp index 116348e01d9f..b2fea5e47299 100644 --- a/dpnp/backend/extensions/lapack/gesvd.hpp +++ b/dpnp/backend/extensions/lapack/gesvd.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getrf.hpp b/dpnp/backend/extensions/lapack/getrf.hpp index 24ec473f4dc7..ce6dc3e788b5 100644 --- a/dpnp/backend/extensions/lapack/getrf.hpp +++ b/dpnp/backend/extensions/lapack/getrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getri.hpp b/dpnp/backend/extensions/lapack/getri.hpp index d8c8e58f3fcb..728af4a77e01 100644 --- a/dpnp/backend/extensions/lapack/getri.hpp +++ b/dpnp/backend/extensions/lapack/getri.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getrs.hpp b/dpnp/backend/extensions/lapack/getrs.hpp index f5a47c69c9ec..2728b0c4e04a 100644 --- a/dpnp/backend/extensions/lapack/getrs.hpp +++ b/dpnp/backend/extensions/lapack/getrs.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp index 96d6a03e9b8e..ecad85f468ef 100644 --- a/dpnp/backend/extensions/lapack/heevd.cpp +++ b/dpnp/backend/extensions/lapack/heevd.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "evd_common.hpp" diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp index e8614498bd41..54521136127a 100644 --- a/dpnp/backend/extensions/lapack/heevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "common_helpers.hpp" diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp index 962edc7b668f..2502fe567a1f 100644 --- a/dpnp/backend/extensions/lapack/orgqr.hpp +++ b/dpnp/backend/extensions/lapack/orgqr.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/potrf.hpp b/dpnp/backend/extensions/lapack/potrf.hpp index d5df48a9ddf4..02faf2c04fde 100644 --- a/dpnp/backend/extensions/lapack/potrf.hpp +++ b/dpnp/backend/extensions/lapack/potrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp index 3ecd386299ac..60dae80e90c6 100644 --- a/dpnp/backend/extensions/lapack/syevd.cpp +++ 
b/dpnp/backend/extensions/lapack/syevd.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "evd_common.hpp" diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp index 13237d27a35c..884b6045f418 100644 --- a/dpnp/backend/extensions/lapack/syevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "common_helpers.hpp" diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp index a149af1e24e1..8c9a36b3f4a6 100644 --- a/dpnp/backend/extensions/lapack/ungqr.hpp +++ b/dpnp/backend/extensions/lapack/ungqr.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index 36786c8cbaf3..434d223de3ab 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -41,6 +41,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -67,14 +70,21 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/statistics/bincount.hpp b/dpnp/backend/extensions/statistics/bincount.hpp index 5e42952349b0..2fc477e71edc 100644 --- a/dpnp/backend/extensions/statistics/bincount.hpp +++ b/dpnp/backend/extensions/statistics/bincount.hpp @@ -31,7 +31,8 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + #include "ext/dispatch_table.hpp" namespace dpctl_td_ns = dpctl::tensor::type_dispatch; diff --git a/dpnp/backend/extensions/statistics/histogram.cpp b/dpnp/backend/extensions/statistics/histogram.cpp index 6d7da6836f60..afc5d9638f48 100644 --- a/dpnp/backend/extensions/statistics/histogram.cpp +++ b/dpnp/backend/extensions/statistics/histogram.cpp @@ -35,8 +35,9 @@ #include #include +#include "dpnp4pybind11.hpp" + // dpctl tensor headers -#include "dpctl4pybind11.hpp" #include "utils/type_dispatch.hpp" #include "histogram.hpp" @@ -50,7 +51,6 @@ using namespace ext::common; namespace { - template struct HistogramEdges { diff --git a/dpnp/backend/extensions/statistics/histogram.hpp b/dpnp/backend/extensions/statistics/histogram.hpp index c6a79ec24ee3..d04d8edbf02b 100644 --- a/dpnp/backend/extensions/statistics/histogram.hpp +++ b/dpnp/backend/extensions/statistics/histogram.hpp @@ -31,7 +31,9 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/dispatch_table.hpp" namespace statistics::histogram diff --git 
a/dpnp/backend/extensions/statistics/histogram_common.cpp b/dpnp/backend/extensions/statistics/histogram_common.cpp index 82afa2bd965d..252e1cd7c7cc 100644 --- a/dpnp/backend/extensions/statistics/histogram_common.cpp +++ b/dpnp/backend/extensions/statistics/histogram_common.cpp @@ -31,15 +31,18 @@ #include #include -#include "dpctl4pybind11.hpp" -#include "utils/type_dispatch.hpp" - #include +#include "dpnp4pybind11.hpp" + #include "histogram_common.hpp" +// utils extension header #include "ext/validation_utils.hpp" +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + namespace dpctl_td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::usm_ndarray; using dpctl_td_ns::typenum_t; @@ -57,7 +60,6 @@ using ext::validation::name_of; namespace statistics::histogram { - void validate(const usm_ndarray &sample, const std::optional &bins, const std::optional &weights, diff --git a/dpnp/backend/extensions/statistics/histogram_common.hpp b/dpnp/backend/extensions/statistics/histogram_common.hpp index 8091e8874d17..47fef11061f3 100644 --- a/dpnp/backend/extensions/statistics/histogram_common.hpp +++ b/dpnp/backend/extensions/statistics/histogram_common.hpp @@ -35,7 +35,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "ext/common.hpp" #include "kernels/statistics/histogram.hpp" diff --git a/dpnp/backend/extensions/statistics/histogramdd.hpp b/dpnp/backend/extensions/statistics/histogramdd.hpp index 327e9941dbc6..d7c46ae34b7d 100644 --- a/dpnp/backend/extensions/statistics/histogramdd.hpp +++ b/dpnp/backend/extensions/statistics/histogramdd.hpp @@ -31,7 +31,9 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/dispatch_table.hpp" namespace statistics::histogram diff --git a/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp b/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp index b8f679f1030e..6c0e39a11a19 100644 --- a/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp +++ b/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp @@ -33,11 +33,14 @@ #include #include +#include "dpnp4pybind11.hpp" + +// utils extension header +#include "ext/common.hpp" + // dpctl tensor headers -#include "dpctl4pybind11.hpp" #include "utils/type_dispatch.hpp" -#include "ext/common.hpp" #include "sliding_dot_product1d.hpp" #include "sliding_window1d.hpp" @@ -51,7 +54,6 @@ using namespace ext::common; namespace { - template struct SlidingDotProductF { diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.cpp b/dpnp/backend/extensions/statistics/sliding_window1d.cpp index 3ae66daa332b..81f8ae40104e 100644 --- a/dpnp/backend/extensions/statistics/sliding_window1d.cpp +++ b/dpnp/backend/extensions/statistics/sliding_window1d.cpp @@ -29,11 +29,16 @@ #include #include -#include "dpctl4pybind11.hpp" -#include "utils/type_dispatch.hpp" #include +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/validation_utils.hpp" + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + #include "sliding_window1d.hpp" namespace dpctl_td_ns = dpctl::tensor::type_dispatch; @@ -48,7 +53,6 @@ using ext::validation::name_of; namespace statistics::sliding_window1d { - void validate(const usm_ndarray &a, const usm_ndarray &v, const usm_ndarray &out, @@ -89,5 +93,4 @@ void validate(const usm_ndarray &a, std::to_string(expected_output_size) + ")"); } } - } // namespace statistics::sliding_window1d diff --git 
a/dpnp/backend/extensions/statistics/sliding_window1d.hpp b/dpnp/backend/extensions/statistics/sliding_window1d.hpp index 329c96dfc1c6..a13c1f873e78 100644 --- a/dpnp/backend/extensions/statistics/sliding_window1d.hpp +++ b/dpnp/backend/extensions/statistics/sliding_window1d.hpp @@ -34,7 +34,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/statistics/sliding_window1d.hpp" diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index ae6015e11d0f..2b01823d01f3 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -67,6 +67,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause @@ -84,14 +87,21 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(_dpnp_sycl_targets) diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp index a0842f4ef259..761bd330a326 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "bitwise_count.hpp" #include "kernels/elementwise_functions/bitwise_count.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp index 77452a6b777f..729fcb576c77 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "degrees.hpp" #include "kernels/elementwise_functions/degrees.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp index af87dcc85f53..1bb3859a39f4 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "divmod.hpp" #include "kernels/elementwise_functions/divmod.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp 
b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp index 6f10e651fe25..c07989939b70 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "erf_funcs.hpp" #include "kernels/elementwise_functions/erf.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp index d2b6ae24ac4b..f7c2183633af 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fabs.hpp" #include "kernels/elementwise_functions/fabs.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp index 0994afc7c738..43927eb93806 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "float_power.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp index 5e1a9f33444b..9471feaf2166 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp @@ -28,9 +28,13 @@ #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmax.hpp" #include "kernels/elementwise_functions/fmax.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp index c0e1db654317..8e279897f414 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp @@ -28,9 +28,13 @@ #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmin.hpp" #include "kernels/elementwise_functions/fmin.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp index 5b83595b3f7c..83fb750b6907 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmod.hpp" #include "kernels/elementwise_functions/fmod.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp index 4439f1e76993..17e09f3ee816 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp @@ -31,9 +31,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "frexp.hpp" #include 
"kernels/elementwise_functions/frexp.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp index ec10504fa15e..0481365356ca 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "gcd.hpp" #include "kernels/elementwise_functions/gcd.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp index e3212de86f7f..62affd206420 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "heaviside.hpp" #include "kernels/elementwise_functions/heaviside.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp index 4d120a56e837..53ded341b58b 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "i0.hpp" #include "kernels/elementwise_functions/i0.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp index 8830569ce9cf..36dae50e7b2c 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp @@ -35,12 +35,14 @@ #include #include -#include - -#include "dpctl4pybind11.hpp" +#include #include #include +#include + +#include "dpnp4pybind11.hpp" + #include "kernels/elementwise_functions/interpolate.hpp" // dpctl tensor headers diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp index b8179feb9263..3025cbf16586 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp @@ -32,12 +32,14 @@ #include #include -#include - -#include "dpctl4pybind11.hpp" +#include #include #include +#include + +#include "dpnp4pybind11.hpp" + #include "kernels/elementwise_functions/isclose.hpp" #include "../../elementwise_functions/simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp index 4276ceb6b246..35138e903eac 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/lcm.hpp" #include "lcm.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp index 3e2c4f3d0149..44ef51726a6a 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp +++ 
b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/ldexp.hpp" #include "ldexp.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp index 57c7c60ca9cf..e37f13b119d6 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp @@ -28,9 +28,13 @@ #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/logaddexp2.hpp" #include "logaddexp2.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp index f8aab23d5630..266103248521 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp @@ -31,9 +31,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/modf.hpp" #include "modf.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp index 2490f1921a98..c30d388f8afd 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp @@ -38,11 +38,12 @@ #include -#include "dpctl4pybind11.hpp" #include #include #include +#include "dpnp4pybind11.hpp" + #include "kernels/elementwise_functions/nan_to_num.hpp" #include "../../elementwise_functions/simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp index 7fc8ae5331dd..0a481fd33d11 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/radians.hpp" #include "populate.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp index abd02e1e6282..87a911472db2 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/sinc.hpp" #include "populate.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp index 6e401c5388dd..4c14582f30ae 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/spacing.hpp" #include "populate.hpp" diff --git 
a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 7165f7b926fb..05aa64e0d814 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -90,6 +90,9 @@ set(python_module_name _vm_impl) pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause @@ -107,14 +110,20 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/vm/abs.cpp b/dpnp/backend/extensions/vm/abs.cpp index 133f3077ac43..1dc8143dd5ff 100644 --- a/dpnp/backend/extensions/vm/abs.cpp +++ b/dpnp/backend/extensions/vm/abs.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "abs.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/acos.cpp b/dpnp/backend/extensions/vm/acos.cpp index 0cb9bb32f4b8..15b4ce80cc3c 100644 --- a/dpnp/backend/extensions/vm/acos.cpp +++ b/dpnp/backend/extensions/vm/acos.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "acos.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/acosh.cpp b/dpnp/backend/extensions/vm/acosh.cpp index fa25ecf5cc1e..eed835b78e10 100644 --- a/dpnp/backend/extensions/vm/acosh.cpp +++ b/dpnp/backend/extensions/vm/acosh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "acosh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/add.cpp b/dpnp/backend/extensions/vm/add.cpp index 165671c93415..a58aac727cd1 100644 --- a/dpnp/backend/extensions/vm/add.cpp +++ b/dpnp/backend/extensions/vm/add.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "add.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/arg.cpp b/dpnp/backend/extensions/vm/arg.cpp index e062f1f2ee06..c50c4a33dee1 100644 --- a/dpnp/backend/extensions/vm/arg.cpp +++ b/dpnp/backend/extensions/vm/arg.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "arg.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/asin.cpp b/dpnp/backend/extensions/vm/asin.cpp index 8a2e1c079ed8..5af7033fed21 100644 --- a/dpnp/backend/extensions/vm/asin.cpp +++ b/dpnp/backend/extensions/vm/asin.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "asin.hpp" #include "common.hpp" diff --git 
a/dpnp/backend/extensions/vm/asinh.cpp b/dpnp/backend/extensions/vm/asinh.cpp index 176bacdb92a8..5b0f8ed13106 100644 --- a/dpnp/backend/extensions/vm/asinh.cpp +++ b/dpnp/backend/extensions/vm/asinh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "asinh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atan.cpp b/dpnp/backend/extensions/vm/atan.cpp index 21c8c8f1c9d5..2255000c1c4b 100644 --- a/dpnp/backend/extensions/vm/atan.cpp +++ b/dpnp/backend/extensions/vm/atan.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "atan.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atan2.cpp b/dpnp/backend/extensions/vm/atan2.cpp index 1d4e5c333e68..bf29e2921a1d 100644 --- a/dpnp/backend/extensions/vm/atan2.cpp +++ b/dpnp/backend/extensions/vm/atan2.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "atan2.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atanh.cpp b/dpnp/backend/extensions/vm/atanh.cpp index 7097fabf602f..9daab09980e6 100644 --- a/dpnp/backend/extensions/vm/atanh.cpp +++ b/dpnp/backend/extensions/vm/atanh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "atanh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/cbrt.cpp b/dpnp/backend/extensions/vm/cbrt.cpp index db3cdfcebd8d..34ff8dd913ac 100644 --- a/dpnp/backend/extensions/vm/cbrt.cpp +++ b/dpnp/backend/extensions/vm/cbrt.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "cbrt.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/ceil.cpp b/dpnp/backend/extensions/vm/ceil.cpp index 6f5aeba16f99..e76a30d28317 100644 --- a/dpnp/backend/extensions/vm/ceil.cpp +++ b/dpnp/backend/extensions/vm/ceil.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "ceil.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/common.hpp b/dpnp/backend/extensions/vm/common.hpp index 325aba7fafd2..5d2631d5b556 100644 --- a/dpnp/backend/extensions/vm/common.hpp +++ b/dpnp/backend/extensions/vm/common.hpp @@ -34,10 +34,10 @@ #include #include +#include #include -#include -#include +#include "dpnp4pybind11.hpp" // utils extension header #include "ext/common.hpp" diff --git a/dpnp/backend/extensions/vm/conj.cpp b/dpnp/backend/extensions/vm/conj.cpp index 36710104750a..f77020cf1d55 100644 --- a/dpnp/backend/extensions/vm/conj.cpp +++ b/dpnp/backend/extensions/vm/conj.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "conj.hpp" diff --git a/dpnp/backend/extensions/vm/copysign.cpp b/dpnp/backend/extensions/vm/copysign.cpp index cd90abf65a06..15c0fceec413 100644 --- a/dpnp/backend/extensions/vm/copysign.cpp +++ b/dpnp/backend/extensions/vm/copysign.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "copysign.hpp" diff --git a/dpnp/backend/extensions/vm/cos.cpp b/dpnp/backend/extensions/vm/cos.cpp index 76db72594763..7c9b0c35d6ca 100644 --- a/dpnp/backend/extensions/vm/cos.cpp 
+++ b/dpnp/backend/extensions/vm/cos.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "cos.hpp" diff --git a/dpnp/backend/extensions/vm/cosh.cpp b/dpnp/backend/extensions/vm/cosh.cpp index 464410b1accc..a95c7075ba61 100644 --- a/dpnp/backend/extensions/vm/cosh.cpp +++ b/dpnp/backend/extensions/vm/cosh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "cosh.hpp" diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp index ad96f9acf083..6e0cb4d0439f 100644 --- a/dpnp/backend/extensions/vm/div.cpp +++ b/dpnp/backend/extensions/vm/div.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "div.hpp" diff --git a/dpnp/backend/extensions/vm/erf_funcs.cpp b/dpnp/backend/extensions/vm/erf_funcs.cpp index 4e84403eb061..7be7f691edcf 100644 --- a/dpnp/backend/extensions/vm/erf_funcs.cpp +++ b/dpnp/backend/extensions/vm/erf_funcs.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "erf_funcs.hpp" diff --git a/dpnp/backend/extensions/vm/exp.cpp b/dpnp/backend/extensions/vm/exp.cpp index acd265d191f7..31f50f36171d 100644 --- a/dpnp/backend/extensions/vm/exp.cpp +++ b/dpnp/backend/extensions/vm/exp.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "exp.hpp" diff --git a/dpnp/backend/extensions/vm/exp2.cpp b/dpnp/backend/extensions/vm/exp2.cpp index 82c6c32fb6c5..41f18351fa7d 100644 --- a/dpnp/backend/extensions/vm/exp2.cpp +++ b/dpnp/backend/extensions/vm/exp2.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "exp2.hpp" diff --git a/dpnp/backend/extensions/vm/expm1.cpp b/dpnp/backend/extensions/vm/expm1.cpp index 93cef7b3272d..37440cab9b0c 100644 --- a/dpnp/backend/extensions/vm/expm1.cpp +++ b/dpnp/backend/extensions/vm/expm1.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "expm1.hpp" diff --git a/dpnp/backend/extensions/vm/floor.cpp b/dpnp/backend/extensions/vm/floor.cpp index fb1a86eda7bf..771d141e7f6a 100644 --- a/dpnp/backend/extensions/vm/floor.cpp +++ b/dpnp/backend/extensions/vm/floor.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "floor.hpp" diff --git a/dpnp/backend/extensions/vm/fmax.cpp b/dpnp/backend/extensions/vm/fmax.cpp index 32786a3e8fc2..d01b3ef3dc42 100644 --- a/dpnp/backend/extensions/vm/fmax.cpp +++ b/dpnp/backend/extensions/vm/fmax.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fmax.hpp" diff --git a/dpnp/backend/extensions/vm/fmin.cpp b/dpnp/backend/extensions/vm/fmin.cpp index d923b8c7ddfb..6fbebba556f8 100644 --- a/dpnp/backend/extensions/vm/fmin.cpp +++ b/dpnp/backend/extensions/vm/fmin.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include 
"common.hpp" #include "fmin.hpp" diff --git a/dpnp/backend/extensions/vm/fmod.cpp b/dpnp/backend/extensions/vm/fmod.cpp index 6c8a4ac705e4..1330453d6f84 100644 --- a/dpnp/backend/extensions/vm/fmod.cpp +++ b/dpnp/backend/extensions/vm/fmod.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fmod.hpp" diff --git a/dpnp/backend/extensions/vm/hypot.cpp b/dpnp/backend/extensions/vm/hypot.cpp index 92b7c78f8ad6..a9b3d3c12288 100644 --- a/dpnp/backend/extensions/vm/hypot.cpp +++ b/dpnp/backend/extensions/vm/hypot.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "hypot.hpp" diff --git a/dpnp/backend/extensions/vm/i0.cpp b/dpnp/backend/extensions/vm/i0.cpp index 5db3ef9d9669..50f692ebd958 100644 --- a/dpnp/backend/extensions/vm/i0.cpp +++ b/dpnp/backend/extensions/vm/i0.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "i0.hpp" diff --git a/dpnp/backend/extensions/vm/inv.cpp b/dpnp/backend/extensions/vm/inv.cpp index 1adeb1be23d0..eda08a6d0cd5 100644 --- a/dpnp/backend/extensions/vm/inv.cpp +++ b/dpnp/backend/extensions/vm/inv.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "inv.hpp" diff --git a/dpnp/backend/extensions/vm/ln.cpp b/dpnp/backend/extensions/vm/ln.cpp index e60a0545005b..a5365e4d5a8b 100644 --- a/dpnp/backend/extensions/vm/ln.cpp +++ b/dpnp/backend/extensions/vm/ln.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "ln.hpp" diff --git a/dpnp/backend/extensions/vm/log10.cpp b/dpnp/backend/extensions/vm/log10.cpp index d26ec57ab9ce..c04fb602f63d 100644 --- a/dpnp/backend/extensions/vm/log10.cpp +++ b/dpnp/backend/extensions/vm/log10.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log10.hpp" diff --git a/dpnp/backend/extensions/vm/log1p.cpp b/dpnp/backend/extensions/vm/log1p.cpp index 861804f8f6e0..04416bf37185 100644 --- a/dpnp/backend/extensions/vm/log1p.cpp +++ b/dpnp/backend/extensions/vm/log1p.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log1p.hpp" diff --git a/dpnp/backend/extensions/vm/log2.cpp b/dpnp/backend/extensions/vm/log2.cpp index e75e96c32fe9..752caa261977 100644 --- a/dpnp/backend/extensions/vm/log2.cpp +++ b/dpnp/backend/extensions/vm/log2.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log2.hpp" diff --git a/dpnp/backend/extensions/vm/modf.cpp b/dpnp/backend/extensions/vm/modf.cpp index ef68c79d8b42..418e4e44f7f7 100644 --- a/dpnp/backend/extensions/vm/modf.cpp +++ b/dpnp/backend/extensions/vm/modf.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "modf.hpp" diff --git a/dpnp/backend/extensions/vm/mul.cpp b/dpnp/backend/extensions/vm/mul.cpp index 0c9cf7fb79cc..557cfb8882b3 100644 --- a/dpnp/backend/extensions/vm/mul.cpp +++ 
b/dpnp/backend/extensions/vm/mul.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "mul.hpp" diff --git a/dpnp/backend/extensions/vm/nextafter.cpp b/dpnp/backend/extensions/vm/nextafter.cpp index 59b205b3d62a..a8ff710bda77 100644 --- a/dpnp/backend/extensions/vm/nextafter.cpp +++ b/dpnp/backend/extensions/vm/nextafter.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "nextafter.hpp" diff --git a/dpnp/backend/extensions/vm/pow.cpp b/dpnp/backend/extensions/vm/pow.cpp index 5969a4862730..f0db87d1ef48 100644 --- a/dpnp/backend/extensions/vm/pow.cpp +++ b/dpnp/backend/extensions/vm/pow.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "pow.hpp" diff --git a/dpnp/backend/extensions/vm/rint.cpp b/dpnp/backend/extensions/vm/rint.cpp index 41cd20a944a0..86931f259a04 100644 --- a/dpnp/backend/extensions/vm/rint.cpp +++ b/dpnp/backend/extensions/vm/rint.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "rint.hpp" diff --git a/dpnp/backend/extensions/vm/sin.cpp b/dpnp/backend/extensions/vm/sin.cpp index 9263c3c4ffcf..7bb6ec321d2a 100644 --- a/dpnp/backend/extensions/vm/sin.cpp +++ b/dpnp/backend/extensions/vm/sin.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sin.hpp" diff --git a/dpnp/backend/extensions/vm/sinh.cpp b/dpnp/backend/extensions/vm/sinh.cpp index a1bae13a5281..5c351afd3b82 100644 --- a/dpnp/backend/extensions/vm/sinh.cpp +++ b/dpnp/backend/extensions/vm/sinh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sinh.hpp" diff --git a/dpnp/backend/extensions/vm/sqr.cpp b/dpnp/backend/extensions/vm/sqr.cpp index 88c2e833b483..9d5cb8af5f2c 100644 --- a/dpnp/backend/extensions/vm/sqr.cpp +++ b/dpnp/backend/extensions/vm/sqr.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sqr.hpp" diff --git a/dpnp/backend/extensions/vm/sqrt.cpp b/dpnp/backend/extensions/vm/sqrt.cpp index 98cf2eea9253..5ab3489c1288 100644 --- a/dpnp/backend/extensions/vm/sqrt.cpp +++ b/dpnp/backend/extensions/vm/sqrt.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sqrt.hpp" diff --git a/dpnp/backend/extensions/vm/sub.cpp b/dpnp/backend/extensions/vm/sub.cpp index 5ee01f239c06..401588d4b65f 100644 --- a/dpnp/backend/extensions/vm/sub.cpp +++ b/dpnp/backend/extensions/vm/sub.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sub.hpp" diff --git a/dpnp/backend/extensions/vm/tan.cpp b/dpnp/backend/extensions/vm/tan.cpp index 46555ebd0178..590320034934 100644 --- a/dpnp/backend/extensions/vm/tan.cpp +++ b/dpnp/backend/extensions/vm/tan.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "tan.hpp" 
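Note: every vm/*.cpp hunk above applies the same mechanical substitution, replacing the dpctl-provided "dpctl4pybind11.hpp" with the vendored "dpnp4pybind11.hpp" (added as a new file later in this patch), along with a few additional includes. The practical effect is that the pybind11 type casters and the dpctl::tensor::usm_ndarray wrapper resolve against dpnp.tensor._usmarray instead of dpctl.tensor._usmarray. A minimal sketch of a converted extension source, assuming a hypothetical module _example_impl and function nbytes (neither exists in this patch):

    #include <cstddef>

    #include <pybind11/pybind11.h>
    #include <sycl/sycl.hpp>

    #include "dpnp4pybind11.hpp"

    namespace py = pybind11;

    // usm_ndarray below is the wrapper class defined in dpnp4pybind11.hpp;
    // its type caster accepts dpnp.tensor._usmarray.usm_ndarray objects.
    static std::size_t nbytes(const dpctl::tensor::usm_ndarray &src)
    {
        return static_cast<std::size_t>(src.get_size()) * src.get_elemsize();
    }

    PYBIND11_MODULE(_example_impl, m)
    {
        m.def("nbytes", &nbytes, "Total number of bytes held by a usm_ndarray");
    }

The tanh.cpp and trunc.cpp hunks below apply the same substitution.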
diff --git a/dpnp/backend/extensions/vm/tanh.cpp b/dpnp/backend/extensions/vm/tanh.cpp index 04d2febfac1d..8febd94f2ec8 100644 --- a/dpnp/backend/extensions/vm/tanh.cpp +++ b/dpnp/backend/extensions/vm/tanh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "tanh.hpp" diff --git a/dpnp/backend/extensions/vm/trunc.cpp b/dpnp/backend/extensions/vm/trunc.cpp index c23a9a8180fb..4ec788ccf949 100644 --- a/dpnp/backend/extensions/vm/trunc.cpp +++ b/dpnp/backend/extensions/vm/trunc.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "trunc.hpp" diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index 0cebfe79b2de..9dac2df9d0df 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -36,6 +36,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -62,14 +65,21 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/window/common.hpp b/dpnp/backend/extensions/window/common.hpp index 9e7b1192e3a2..fcec281b3948 100644 --- a/dpnp/backend/extensions/window/common.hpp +++ b/dpnp/backend/extensions/window/common.hpp @@ -37,7 +37,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include #include diff --git a/dpnp/backend/extensions/window/kaiser.hpp b/dpnp/backend/extensions/window/kaiser.hpp index 4ba506620db2..9a088e700a2f 100644 --- a/dpnp/backend/extensions/window/kaiser.hpp +++ b/dpnp/backend/extensions/window/kaiser.hpp @@ -30,7 +30,7 @@ #include -#include +#include #include namespace dpnp::extensions::window diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp new file mode 100644 index 000000000000..8bc931a3ca1a --- /dev/null +++ b/dpnp/backend/include/dpnp4pybind11.hpp @@ -0,0 +1,1328 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +// Include dpctl C-API headers +#include "dpctl_capi.h" + +// Include generated Cython headers for usm_ndarray +// (struct definition and constants only) +#include "dpnp/tensor/_usmarray.h" +#include "dpnp/tensor/_usmarray_api.h" + +#include +#include +#include +#include // for std::size_t for C++ linkage +#include +#include +#include // for size_t for C linkage +#include +#include +#include +#include + +#include + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace detail +{ +// Lookup a type according to its size, and return a value corresponding to the +// NumPy typenum. +template +constexpr int platform_typeid_lookup() +{ + return -1; +} + +template +constexpr int platform_typeid_lookup(int I, Ints... Is) +{ + return sizeof(Concrete) == sizeof(T) + ? 
I + : platform_typeid_lookup(Is...); +} + +class dpctl_capi +{ +public: + // dpctl type objects + PyTypeObject *Py_SyclDeviceType_; + PyTypeObject *PySyclDeviceType_; + PyTypeObject *Py_SyclContextType_; + PyTypeObject *PySyclContextType_; + PyTypeObject *Py_SyclEventType_; + PyTypeObject *PySyclEventType_; + PyTypeObject *Py_SyclQueueType_; + PyTypeObject *PySyclQueueType_; + PyTypeObject *Py_MemoryType_; + PyTypeObject *PyMemoryUSMDeviceType_; + PyTypeObject *PyMemoryUSMSharedType_; + PyTypeObject *PyMemoryUSMHostType_; + PyTypeObject *PyUSMArrayType_; + PyTypeObject *PySyclProgramType_; + PyTypeObject *PySyclKernelType_; + + DPCTLSyclDeviceRef (*SyclDevice_GetDeviceRef_)(PySyclDeviceObject *); + PySyclDeviceObject *(*SyclDevice_Make_)(DPCTLSyclDeviceRef); + + DPCTLSyclContextRef (*SyclContext_GetContextRef_)(PySyclContextObject *); + PySyclContextObject *(*SyclContext_Make_)(DPCTLSyclContextRef); + + DPCTLSyclEventRef (*SyclEvent_GetEventRef_)(PySyclEventObject *); + PySyclEventObject *(*SyclEvent_Make_)(DPCTLSyclEventRef); + + DPCTLSyclQueueRef (*SyclQueue_GetQueueRef_)(PySyclQueueObject *); + PySyclQueueObject *(*SyclQueue_Make_)(DPCTLSyclQueueRef); + + // memory + DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *); + void *(*Memory_GetOpaquePointer_)(Py_MemoryObject *); + DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *); + DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *); + size_t (*Memory_GetNumBytes_)(Py_MemoryObject *); + PyObject *(*Memory_Make_)(DPCTLSyclUSMRef, + size_t, + DPCTLSyclQueueRef, + PyObject *); + + // program + DPCTLSyclKernelRef (*SyclKernel_GetKernelRef_)(PySyclKernelObject *); + PySyclKernelObject *(*SyclKernel_Make_)(DPCTLSyclKernelRef, const char *); + + DPCTLSyclKernelBundleRef (*SyclProgram_GetKernelBundleRef_)( + PySyclProgramObject *); + PySyclProgramObject *(*SyclProgram_Make_)(DPCTLSyclKernelBundleRef); + + int USM_ARRAY_C_CONTIGUOUS_; + int USM_ARRAY_F_CONTIGUOUS_; + int USM_ARRAY_WRITABLE_; + int UAR_BOOL_, UAR_BYTE_, UAR_UBYTE_, UAR_SHORT_, UAR_USHORT_, UAR_INT_, + UAR_UINT_, UAR_LONG_, UAR_ULONG_, UAR_LONGLONG_, UAR_ULONGLONG_, + UAR_FLOAT_, UAR_DOUBLE_, UAR_CFLOAT_, UAR_CDOUBLE_, UAR_TYPE_SENTINEL_, + UAR_HALF_; + int UAR_INT8_, UAR_UINT8_, UAR_INT16_, UAR_UINT16_, UAR_INT32_, UAR_UINT32_, + UAR_INT64_, UAR_UINT64_; + + bool PySyclDevice_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclDeviceType_) != 0; + } + bool PySyclContext_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclContextType_) != 0; + } + bool PySyclEvent_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclEventType_) != 0; + } + bool PySyclQueue_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclQueueType_) != 0; + } + bool PySyclKernel_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclKernelType_) != 0; + } + bool PySyclProgram_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclProgramType_) != 0; + } + + ~dpctl_capi() + { + as_usm_memory_.reset(); + default_usm_ndarray_.reset(); + default_usm_memory_.reset(); + default_sycl_queue_.reset(); + }; + + static auto &get() + { + static dpctl_capi api{}; + return api; + } + + py::object default_sycl_queue_pyobj() { return *default_sycl_queue_; } + py::object default_usm_memory_pyobj() { return *default_usm_memory_; } + py::object default_usm_ndarray_pyobj() { return *default_usm_ndarray_; } + py::object as_usm_memory_pyobj() { return *as_usm_memory_; } + +private: + struct Deleter + { 
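+ // The deleter below intentionally leaks the py::object instead of
+ // deleting it when the Python interpreter is not initialized or is
+ // already finalizing: touching reference counts at that stage of
+ // interpreter shutdown is unsafe.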
+ void operator()(py::object *p) const + { + const bool initialized = Py_IsInitialized(); +#if PY_VERSION_HEX < 0x30d0000 + const bool finalizing = _Py_IsFinalizing(); +#else + const bool finalizing = Py_IsFinalizing(); +#endif + const bool guard = initialized && !finalizing; + + if (guard) { + delete p; + } + } + }; + + std::shared_ptr default_sycl_queue_; + std::shared_ptr default_usm_memory_; + std::shared_ptr default_usm_ndarray_; + std::shared_ptr as_usm_memory_; + + dpctl_capi() + : Py_SyclDeviceType_(nullptr), PySyclDeviceType_(nullptr), + Py_SyclContextType_(nullptr), PySyclContextType_(nullptr), + Py_SyclEventType_(nullptr), PySyclEventType_(nullptr), + Py_SyclQueueType_(nullptr), PySyclQueueType_(nullptr), + Py_MemoryType_(nullptr), PyMemoryUSMDeviceType_(nullptr), + PyMemoryUSMSharedType_(nullptr), PyMemoryUSMHostType_(nullptr), + PyUSMArrayType_(nullptr), PySyclProgramType_(nullptr), + PySyclKernelType_(nullptr), SyclDevice_GetDeviceRef_(nullptr), + SyclDevice_Make_(nullptr), SyclContext_GetContextRef_(nullptr), + SyclContext_Make_(nullptr), SyclEvent_GetEventRef_(nullptr), + SyclEvent_Make_(nullptr), SyclQueue_GetQueueRef_(nullptr), + SyclQueue_Make_(nullptr), Memory_GetUsmPointer_(nullptr), + Memory_GetOpaquePointer_(nullptr), Memory_GetContextRef_(nullptr), + Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr), + Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr), + SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr), + SyclProgram_Make_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0), + USM_ARRAY_F_CONTIGUOUS_(0), USM_ARRAY_WRITABLE_(0), UAR_BOOL_(-1), + UAR_BYTE_(-1), UAR_UBYTE_(-1), UAR_SHORT_(-1), UAR_USHORT_(-1), + UAR_INT_(-1), UAR_UINT_(-1), UAR_LONG_(-1), UAR_ULONG_(-1), + UAR_LONGLONG_(-1), UAR_ULONGLONG_(-1), UAR_FLOAT_(-1), + UAR_DOUBLE_(-1), UAR_CFLOAT_(-1), UAR_CDOUBLE_(-1), + UAR_TYPE_SENTINEL_(-1), UAR_HALF_(-1), UAR_INT8_(-1), UAR_UINT8_(-1), + UAR_INT16_(-1), UAR_UINT16_(-1), UAR_INT32_(-1), UAR_UINT32_(-1), + UAR_INT64_(-1), UAR_UINT64_(-1), default_sycl_queue_{}, + default_usm_memory_{}, default_usm_ndarray_{}, as_usm_memory_{} + + { + // Import dpctl C-API + // (device, context, event, queue, memory, program) + import_dpctl(); + // Import dpnp tensor module for PyUSMArrayType + import_dpnp__tensor___usmarray(); + + // Python type objects for classes implemented by dpctl + this->Py_SyclDeviceType_ = &Py_SyclDeviceType; + this->PySyclDeviceType_ = &PySyclDeviceType; + this->Py_SyclContextType_ = &Py_SyclContextType; + this->PySyclContextType_ = &PySyclContextType; + this->Py_SyclEventType_ = &Py_SyclEventType; + this->PySyclEventType_ = &PySyclEventType; + this->Py_SyclQueueType_ = &Py_SyclQueueType; + this->PySyclQueueType_ = &PySyclQueueType; + this->Py_MemoryType_ = &Py_MemoryType; + this->PyMemoryUSMDeviceType_ = &PyMemoryUSMDeviceType; + this->PyMemoryUSMSharedType_ = &PyMemoryUSMSharedType; + this->PyMemoryUSMHostType_ = &PyMemoryUSMHostType; + this->PyUSMArrayType_ = &PyUSMArrayType; + this->PySyclProgramType_ = &PySyclProgramType; + this->PySyclKernelType_ = &PySyclKernelType; + + // SyclDevice API + this->SyclDevice_GetDeviceRef_ = SyclDevice_GetDeviceRef; + this->SyclDevice_Make_ = SyclDevice_Make; + + // SyclContext API + this->SyclContext_GetContextRef_ = SyclContext_GetContextRef; + this->SyclContext_Make_ = SyclContext_Make; + + // SyclEvent API + this->SyclEvent_GetEventRef_ = SyclEvent_GetEventRef; + this->SyclEvent_Make_ = SyclEvent_Make; + + // SyclQueue API + this->SyclQueue_GetQueueRef_ = SyclQueue_GetQueueRef; + 
this->SyclQueue_Make_ = SyclQueue_Make; + + // dpctl.memory API + this->Memory_GetUsmPointer_ = Memory_GetUsmPointer; + this->Memory_GetOpaquePointer_ = Memory_GetOpaquePointer; + this->Memory_GetContextRef_ = Memory_GetContextRef; + this->Memory_GetQueueRef_ = Memory_GetQueueRef; + this->Memory_GetNumBytes_ = Memory_GetNumBytes; + this->Memory_Make_ = Memory_Make; + + // dpctl.program API + this->SyclKernel_GetKernelRef_ = SyclKernel_GetKernelRef; + this->SyclKernel_Make_ = SyclKernel_Make; + this->SyclProgram_GetKernelBundleRef_ = SyclProgram_GetKernelBundleRef; + this->SyclProgram_Make_ = SyclProgram_Make; + + // constants + this->USM_ARRAY_C_CONTIGUOUS_ = USM_ARRAY_C_CONTIGUOUS; + this->USM_ARRAY_F_CONTIGUOUS_ = USM_ARRAY_F_CONTIGUOUS; + this->USM_ARRAY_WRITABLE_ = USM_ARRAY_WRITABLE; + this->UAR_BOOL_ = UAR_BOOL; + this->UAR_BYTE_ = UAR_BYTE; + this->UAR_UBYTE_ = UAR_UBYTE; + this->UAR_SHORT_ = UAR_SHORT; + this->UAR_USHORT_ = UAR_USHORT; + this->UAR_INT_ = UAR_INT; + this->UAR_UINT_ = UAR_UINT; + this->UAR_LONG_ = UAR_LONG; + this->UAR_ULONG_ = UAR_ULONG; + this->UAR_LONGLONG_ = UAR_LONGLONG; + this->UAR_ULONGLONG_ = UAR_ULONGLONG; + this->UAR_FLOAT_ = UAR_FLOAT; + this->UAR_DOUBLE_ = UAR_DOUBLE; + this->UAR_CFLOAT_ = UAR_CFLOAT; + this->UAR_CDOUBLE_ = UAR_CDOUBLE; + this->UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL; + this->UAR_HALF_ = UAR_HALF; + + // deduced disjoint types + this->UAR_INT8_ = UAR_BYTE; + this->UAR_UINT8_ = UAR_UBYTE; + this->UAR_INT16_ = UAR_SHORT; + this->UAR_UINT16_ = UAR_USHORT; + this->UAR_INT32_ = + platform_typeid_lookup( + UAR_LONG, UAR_INT, UAR_SHORT); + this->UAR_UINT32_ = + platform_typeid_lookup(UAR_ULONG, UAR_UINT, + UAR_USHORT); + this->UAR_INT64_ = + platform_typeid_lookup( + UAR_LONG, UAR_LONGLONG, UAR_INT); + this->UAR_UINT64_ = + platform_typeid_lookup( + UAR_ULONG, UAR_ULONGLONG, UAR_UINT); + + // create shared pointers to python objects used in type-casters + // for dpctl::memory::usm_memory and dpctl::tensor::usm_ndarray + sycl::queue q_{}; + PySyclQueueObject *py_q_tmp = + SyclQueue_Make(reinterpret_cast(&q_)); + const py::object &py_sycl_queue = py::reinterpret_steal( + reinterpret_cast(py_q_tmp)); + + default_sycl_queue_ = std::shared_ptr( + new py::object(py_sycl_queue), Deleter{}); + + py::module_ mod_memory = py::module_::import("dpctl.memory"); + const py::object &py_as_usm_memory = mod_memory.attr("as_usm_memory"); + as_usm_memory_ = std::shared_ptr( + new py::object{py_as_usm_memory}, Deleter{}); + + auto mem_kl = mod_memory.attr("MemoryUSMHost"); + const py::object &py_default_usm_memory = + mem_kl(1, py::arg("queue") = py_sycl_queue); + default_usm_memory_ = std::shared_ptr( + new py::object{py_default_usm_memory}, Deleter{}); + + py::module_ mod_usmarray = py::module_::import("dpnp.tensor._usmarray"); + auto tensor_kl = mod_usmarray.attr("usm_ndarray"); + + const py::object &py_default_usm_ndarray = + tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"), + py::arg("buffer") = py_default_usm_memory); + + default_usm_ndarray_ = std::shared_ptr( + new py::object{py_default_usm_ndarray}, Deleter{}); + } + + dpctl_capi(dpctl_capi const &) = default; + dpctl_capi &operator=(dpctl_capi const &) = default; + dpctl_capi &operator=(dpctl_capi &&) = default; + +}; // struct dpctl_capi +} // namespace detail +} // namespace dpctl + +namespace pybind11::detail +{ +#define DPCTL_TYPE_CASTER(type, py_name) \ +protected: \ + std::unique_ptr value; \ + \ +public: \ + static constexpr auto name = py_name; \ + template < \ + typename T_, \ + 
::pybind11::detail::enable_if_t< \ + std::is_same>::value, \ + int> = 0> \ + static ::pybind11::handle cast(T_ *src, \ + ::pybind11::return_value_policy policy, \ + ::pybind11::handle parent) \ + { \ + if (!src) \ + return ::pybind11::none().release(); \ + if (policy == ::pybind11::return_value_policy::take_ownership) { \ + auto h = cast(std::move(*src), policy, parent); \ + delete src; \ + return h; \ + } \ + return cast(*src, policy, parent); \ + } \ + operator type *() \ + { \ + return value.get(); \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + operator type &() \ + { \ + return *value; \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + operator type &&() && \ + { \ + return std::move(*value); \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + template \ + using cast_op_type = ::pybind11::detail::movable_cast_op_type + +/* This type caster associates ``sycl::queue`` C++ class with + * :class:`dpctl.SyclQueue` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclQueue_Check_(source)) { + DPCTLSyclQueueRef QRef = api.SyclQueue_GetQueueRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(QRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclQueue"); + } + } + + static handle cast(sycl::queue src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclQueue_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::queue, _("dpctl.SyclQueue")); +}; + +/* This type caster associates ``sycl::device`` C++ class with + * :class:`dpctl.SyclDevice` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclDevice_Check_(source)) { + DPCTLSyclDeviceRef DRef = api.SyclDevice_GetDeviceRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(DRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclDevice"); + } + } + + static handle cast(sycl::device src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclDevice_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::device, _("dpctl.SyclDevice")); +}; + +/* This type caster associates ``sycl::context`` C++ class with + * :class:`dpctl.SyclContext` for the purposes of generation of + * Python bindings by pybind11. 
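+ *
+ * A short usage sketch (the module and function names below are
+ * illustrative only and are not defined by this header):
+ *
+ *     PYBIND11_MODULE(_ctx_example, m)
+ *     {
+ *         m.def("device_count", [](const sycl::context &ctx) {
+ *             return ctx.get_devices().size();
+ *         });
+ *     }
+ *
+ * A dpctl.SyclContext argument passed from Python is unpacked by load();
+ * any other argument type results in a py::type_error.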
+ */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclContext_Check_(source)) { + DPCTLSyclContextRef CRef = api.SyclContext_GetContextRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(CRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclContext"); + } + } + + static handle cast(sycl::context src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclContext_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::context, _("dpctl.SyclContext")); +}; + +/* This type caster associates ``sycl::event`` C++ class with + * :class:`dpctl.SyclEvent` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclEvent_Check_(source)) { + DPCTLSyclEventRef ERef = api.SyclEvent_GetEventRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(ERef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclEvent"); + } + } + + static handle cast(sycl::event src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclEvent_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::event, _("dpctl.SyclEvent")); +}; + +/* This type caster associates ``sycl::kernel`` C++ class with + * :class:`dpctl.program.SyclKernel` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclKernel_Check_(source)) { + DPCTLSyclKernelRef KRef = api.SyclKernel_GetKernelRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(KRef))); + return true; + } + else { + throw py::type_error("Input is of unexpected type, expected " + "dpctl.program.SyclKernel"); + } + } + + static handle cast(sycl::kernel src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclKernel_Make_(reinterpret_cast(&src), + "dpctl4pybind11_kernel"); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::kernel, _("dpctl.program.SyclKernel")); +}; + +/* This type caster associates + * ``sycl::kernel_bundle`` C++ class with + * :class:`dpctl.program.SyclProgram` for the purposes of generation of + * Python bindings by pybind11. 
+ */ +template <> +struct type_caster> +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclProgram_Check_(source)) { + DPCTLSyclKernelBundleRef KBRef = + api.SyclProgram_GetKernelBundleRef_( + reinterpret_cast(source)); + value = std::make_unique< + sycl::kernel_bundle>( + *(reinterpret_cast< + sycl::kernel_bundle *>( + KBRef))); + return true; + } + else { + throw py::type_error("Input is of unexpected type, expected " + "dpctl.program.SyclProgram"); + } + } + + static handle cast(sycl::kernel_bundle src, + return_value_policy, + handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = api.SyclProgram_Make_( + reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::kernel_bundle, + _("dpctl.program.SyclProgram")); +}; + +/* This type caster associates + * ``sycl::half`` C++ class with Python :class:`float` for the purposes + * of generation of Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool convert) + { + double py_value; + + if (!src) { + return false; + } + + PyObject *source = src.ptr(); + + if (convert || PyFloat_Check(source)) { + py_value = PyFloat_AsDouble(source); + } + else { + return false; + } + + bool py_err = (py_value == double(-1)) && PyErr_Occurred(); + + if (py_err) { + PyErr_Clear(); + if (convert && (PyNumber_Check(source) != 0)) { + auto tmp = reinterpret_steal(PyNumber_Float(source)); + return load(tmp, false); + } + return false; + } + value = static_cast(py_value); + return true; + } + + static handle cast(sycl::half src, return_value_policy, handle) + { + return PyFloat_FromDouble(static_cast(src)); + } + + PYBIND11_TYPE_CASTER(sycl::half, _("float")); +}; +} // namespace pybind11::detail + +namespace dpctl +{ +namespace memory +{ +// since PYBIND11_OBJECT_CVT uses error_already_set without a namespace +// qualifier, this using-declaration avoids a compilation error +using pybind11::error_already_set; + +class usm_memory : public py::object +{ +public: + PYBIND11_OBJECT_CVT( + usm_memory, + py::object, + [](PyObject *o) -> bool { + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().Py_MemoryType_) != + 0; + }, + [](PyObject *o) -> PyObject * { return as_usm_memory(o); }) + + usm_memory() + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_memory_pyobj(), + borrowed_t{}) + { + if (!m_ptr) + throw py::error_already_set(); + } + + /*! @brief Create usm_memory object from shared pointer that manages + * lifetime of the USM allocation.
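+ *
+ * The constructor takes the raw USM pointer, the allocation size in
+ * bytes, the sycl::queue the allocation is bound to, and the shared_ptr
+ * whose deleter is expected to eventually free the allocation.
+ *
+ * A minimal sketch, assuming a sycl::queue q and a byte count n
+ * (illustrative only):
+ *
+ *     auto sp = std::shared_ptr<void>(
+ *         sycl::malloc_device<char>(n, q),
+ *         [q](void *ptr) { sycl::free(ptr, q); });
+ *     dpctl::memory::usm_memory mem(sp.get(), n, q, sp);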
+ */ + usm_memory(void *usm_ptr, + std::size_t nbytes, + const sycl::queue &q, + std::shared_ptr shptr) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclUSMRef usm_ref = reinterpret_cast(usm_ptr); + auto q_uptr = std::make_unique(q); + DPCTLSyclQueueRef QRef = + reinterpret_cast(q_uptr.get()); + + auto vacuous_destructor = []() {}; + py::capsule mock_owner(vacuous_destructor); + + // create memory object owned by mock_owner, it is a new reference + PyObject *_memory = + api.Memory_Make_(usm_ref, nbytes, QRef, mock_owner.ptr()); + auto ref_count_decrementer = [](PyObject *o) noexcept { Py_DECREF(o); }; + + using py_uptrT = + std::unique_ptr; + + if (!_memory) { + throw py::error_already_set(); + } + + auto memory_uptr = py_uptrT(_memory, ref_count_decrementer); + std::shared_ptr *opaque_ptr = new std::shared_ptr(shptr); + + Py_MemoryObject *memobj = reinterpret_cast(_memory); + // replace the mock_owner capsule with Py_None as the owner + memobj->refobj = Py_None; + // set the opaque ptr field so that usm_memory knows the USM + // allocation is managed by a smart pointer + memobj->_opaque_ptr = reinterpret_cast(opaque_ptr); + + // _memory now owns the created copies of sycl::queue and + // std::shared_ptr; the deleter of the shared_ptr is responsible + // for freeing the USM allocation + m_ptr = _memory; + q_uptr.release(); + memory_uptr.release(); + } + + sycl::queue get_queue() const + { + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); + sycl::queue *obj_q = reinterpret_cast(QRef); + return *obj_q; + } + + char *get_pointer() const + { + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclUSMRef MRef = api.Memory_GetUsmPointer_(mem_obj); + return reinterpret_cast(MRef); + } + + std::size_t get_nbytes() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + return api.Memory_GetNumBytes_(mem_obj); + } + + bool is_managed_by_smart_ptr() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + return bool(opaque_ptr); + } + + const std::shared_ptr &get_smart_ptr_owner() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + if (opaque_ptr) { + auto shptr_ptr = + reinterpret_cast *>(opaque_ptr); + return *shptr_ptr; + } + else { + throw std::runtime_error( + "Memory object does not have smart pointer " + "managing lifetime of USM allocation"); + } + } + +protected: + static PyObject *as_usm_memory(PyObject *o) + { + if (o == nullptr) { + PyErr_SetString(PyExc_ValueError, + "cannot create a usm_memory from a nullptr"); + return nullptr; + } + + auto converter = + ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj(); + + py::object res; + try { + res = converter(py::handle(o)); + } catch (const py::error_already_set &e) { + return nullptr; + } + return res.ptr(); + } +}; +} // end namespace memory + +namespace tensor +{ +inline std::vector + c_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector c_strides(nd, element_size); + for (int ic = nd - 1; ic > 0;) { + py::ssize_t next_v = c_strides[ic] * shape[ic]; + c_strides[--ic] =
next_v; + } + return c_strides; + } + else { + return std::vector(); + } +} + +inline std::vector + f_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector f_strides(nd, element_size); + for (int i = 0; i < nd - 1;) { + py::ssize_t next_v = f_strides[i] * shape[i]; + f_strides[++i] = next_v; + } + return f_strides; + } + else { + return std::vector(); + } +} + +inline std::vector + c_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return c_contiguous_strides(shape.size(), shape.data(), element_size); +} + +inline std::vector + f_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return f_contiguous_strides(shape.size(), shape.data(), element_size); +} + +class usm_ndarray : public py::object +{ +public: + PYBIND11_OBJECT(usm_ndarray, py::object, [](PyObject *o) -> bool { + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().PyUSMArrayType_) != 0; + }) + + usm_ndarray() + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_ndarray_pyobj(), + borrowed_t{}) + { + if (!m_ptr) + throw py::error_already_set(); + } + + char *get_data() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->data_; + } + + template + T *get_data() const + { + return reinterpret_cast(get_data()); + } + + int get_ndim() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->nd_; + } + + const py::ssize_t *get_shape_raw() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->shape_; + } + + std::vector get_shape_vector() const + { + auto raw_sh = get_shape_raw(); + auto nd = get_ndim(); + + std::vector shape_vector(raw_sh, raw_sh + nd); + return shape_vector; + } + + py::ssize_t get_shape(int i) const + { + auto shape_ptr = get_shape_raw(); + return shape_ptr[i]; + } + + const py::ssize_t *get_strides_raw() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->strides_; + } + + std::vector get_strides_vector() const + { + auto raw_st = get_strides_raw(); + auto nd = get_ndim(); + + if (raw_st == nullptr) { + auto is_c_contig = is_c_contiguous(); + auto is_f_contig = is_f_contiguous(); + auto raw_sh = get_shape_raw(); + if (is_c_contig) { + const auto &contig_strides = c_contiguous_strides(nd, raw_sh); + return contig_strides; + } + else if (is_f_contig) { + const auto &contig_strides = f_contiguous_strides(nd, raw_sh); + return contig_strides; + } + else { + throw std::runtime_error("Invalid array encountered when " + "building strides"); + } + } + else { + std::vector st_vec(raw_st, raw_st + nd); + return st_vec; + } + } + + py::ssize_t get_size() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + int ndim = raw_ar->nd_; + const py::ssize_t *shape = raw_ar->shape_; + + py::ssize_t nelems = 1; + for (int i = 0; i < ndim; ++i) { + nelems *= shape[i]; + } + + assert(nelems >= 0); + return nelems; + } + + std::pair get_minmax_offsets() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + int nd = raw_ar->nd_; + const py::ssize_t *shape = raw_ar->shape_; + const py::ssize_t *strides = raw_ar->strides_; + + py::ssize_t offset_min = 0; + py::ssize_t offset_max = 0; + if (strides == nullptr) { + py::ssize_t stride(1); + for (int i = 0; i < nd; ++i) { + offset_max += stride * (shape[i] - 1); + stride *= shape[i]; + } + } + else { + for (int i = 0; i < nd; ++i) { + py::ssize_t delta = strides[i] * (shape[i] - 1); + if (strides[i] > 0) { + offset_max += delta; + } + else { + offset_min 
+= delta; + } + } + } + return std::make_pair(offset_min, offset_max); + } + + sycl::queue get_queue() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + Py_MemoryObject *mem_obj = + reinterpret_cast(raw_ar->base_); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); + return *(reinterpret_cast(QRef)); + } + + sycl::device get_device() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + Py_MemoryObject *mem_obj = + reinterpret_cast(raw_ar->base_); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); + return reinterpret_cast(QRef)->get_device(); + } + + int get_typenum() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->typenum_; + } + + int get_flags() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->flags_; + } + + int get_elemsize() const + { + int typenum = get_typenum(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + + // Lookup table for element sizes based on typenum + if (typenum == api.UAR_BOOL_) + return 1; + if (typenum == api.UAR_BYTE_) + return 1; + if (typenum == api.UAR_UBYTE_) + return 1; + if (typenum == api.UAR_SHORT_) + return 2; + if (typenum == api.UAR_USHORT_) + return 2; + if (typenum == api.UAR_INT_) + return 4; + if (typenum == api.UAR_UINT_) + return 4; + if (typenum == api.UAR_LONG_) + return sizeof(long); + if (typenum == api.UAR_ULONG_) + return sizeof(unsigned long); + if (typenum == api.UAR_LONGLONG_) + return 8; + if (typenum == api.UAR_ULONGLONG_) + return 8; + if (typenum == api.UAR_FLOAT_) + return 4; + if (typenum == api.UAR_DOUBLE_) + return 8; + if (typenum == api.UAR_CFLOAT_) + return 8; + if (typenum == api.UAR_CDOUBLE_) + return 16; + if (typenum == api.UAR_HALF_) + return 2; + + return 0; // Unknown type + } + + bool is_c_contiguous() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_C_CONTIGUOUS_); + } + + bool is_f_contiguous() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_F_CONTIGUOUS_); + } + + bool is_writable() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_WRITABLE_); + } + + /*! 
@brief Get usm_data property of array */
+    py::object get_usm_data() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+        // base_ is the Memory object - return new reference
+        PyObject *usm_data = raw_ar->base_;
+        Py_XINCREF(usm_data);
+
+        // pass reference ownership to py::object
+        return py::reinterpret_steal<py::object>(usm_data);
+    }
+
+    bool is_managed_by_smart_ptr() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+        PyObject *usm_data = raw_ar->base_;
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
+            return false;
+        }
+
+        Py_MemoryObject *mem_obj =
+            reinterpret_cast<Py_MemoryObject *>(usm_data);
+        const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
+
+        return bool(opaque_ptr);
+    }
+
+    const std::shared_ptr<void> &get_smart_ptr_owner() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+        PyObject *usm_data = raw_ar->base_;
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+
+        if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
+            throw std::runtime_error(
+                "usm_ndarray object does not have Memory object "
+                "managing lifetime of USM allocation");
+        }
+
+        Py_MemoryObject *mem_obj =
+            reinterpret_cast<Py_MemoryObject *>(usm_data);
+        void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
+
+        if (opaque_ptr) {
+            auto shptr_ptr =
+                reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr);
+            return *shptr_ptr;
+        }
+        else {
+            throw std::runtime_error(
+                "Memory object underlying usm_ndarray does not have "
+                "smart pointer managing lifetime of USM allocation");
+        }
+    }
+
+private:
+    PyUSMArrayObject *usm_array_ptr() const
+    {
+        return reinterpret_cast<PyUSMArrayObject *>(m_ptr);
+    }
+};
+} // end namespace tensor
+
+namespace utils
+{
+namespace detail
+{
+struct ManagedMemory
+{
+
+    static bool is_usm_managed_by_shared_ptr(const py::object &h)
+    {
+        if (py::isinstance<dpctl::memory::usm_memory>(h)) {
+            const auto &usm_memory_inst =
+                py::cast<dpctl::memory::usm_memory>(h);
+            return usm_memory_inst.is_managed_by_smart_ptr();
+        }
+        else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
+            const auto &usm_array_inst =
+                py::cast<dpctl::tensor::usm_ndarray>(h);
+            return usm_array_inst.is_managed_by_smart_ptr();
+        }
+
+        return false;
+    }
+
+    static const std::shared_ptr<void> &extract_shared_ptr(const py::object &h)
+    {
+        if (py::isinstance<dpctl::memory::usm_memory>(h)) {
+            const auto &usm_memory_inst =
+                py::cast<dpctl::memory::usm_memory>(h);
+            return usm_memory_inst.get_smart_ptr_owner();
+        }
+        else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
+            const auto &usm_array_inst =
+                py::cast<dpctl::tensor::usm_ndarray>(h);
+            return usm_array_inst.get_smart_ptr_owner();
+        }
+
+        throw std::runtime_error(
+            "Attempted extraction of shared_ptr on an unrecognized type");
+    }
+};
+} // end of namespace detail
+
+template <std::size_t num>
+sycl::event keep_args_alive(sycl::queue &q,
+                            const py::object (&py_objs)[num],
+                            const std::vector<sycl::event> &depends = {})
+{
+    std::size_t n_objects_held = 0;
+    std::array<std::shared_ptr<py::handle>, num> shp_arr{};
+
+    std::size_t n_usm_owners_held = 0;
+    std::array<std::shared_ptr<void>, num> shp_usm{};
+
+    for (std::size_t i = 0; i < num; ++i) {
+        const auto &py_obj_i = py_objs[i];
+        if (detail::ManagedMemory::is_usm_managed_by_shared_ptr(py_obj_i)) {
+            const auto &shp =
+                detail::ManagedMemory::extract_shared_ptr(py_obj_i);
+            shp_usm[n_usm_owners_held] = shp;
+            ++n_usm_owners_held;
+        }
+        else {
+            shp_arr[n_objects_held] = std::make_shared<py::handle>(py_obj_i);
+            shp_arr[n_objects_held]->inc_ref();
+            ++n_objects_held;
+        }
+    }
+
+    // the first host_task submitted below waits on `depends`; if a second
+    // one is submitted as well, it is chained after the first via
+    // host_task_ev
+    bool use_depends = true;
+    sycl::event host_task_ev;
+
+    if (n_usm_owners_held > 0) {
+        host_task_ev = q.submit([&](sycl::handler &cgh) {
+            if (use_depends) {
+                cgh.depends_on(depends);
+                use_depends = false;
+            }
+            else {
+                cgh.depends_on(host_task_ev);
+            }
+            cgh.host_task([shp_usm = std::move(shp_usm)]() {
+                // no body, but shared pointers are captured in
+                // the lambda, ensuring that USM allocation is
+                // kept alive
+            });
+        });
+    }
+
+    if (n_objects_held > 0) {
+        host_task_ev = q.submit([&](sycl::handler &cgh) {
+            if (use_depends) {
+                cgh.depends_on(depends);
+                use_depends = false;
+            }
+            else {
+                cgh.depends_on(host_task_ev);
+            }
+            cgh.host_task([n_objects_held, shp_arr = std::move(shp_arr)]() {
+                py::gil_scoped_acquire acquire;
+
+                for (std::size_t i = 0; i < n_objects_held; ++i) {
+                    shp_arr[i]->dec_ref();
+                }
+            });
+        });
+    }
+
+    return host_task_ev;
+}
+
+/*! @brief Check if all allocation queues are the same as the
+    execution queue */
+template <std::size_t num>
+bool queues_are_compatible(const sycl::queue &exec_q,
+                           const sycl::queue (&alloc_qs)[num])
+{
+    for (std::size_t i = 0; i < num; ++i) {
+
+        if (exec_q != alloc_qs[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/*! @brief Check if all allocation queues of usm_ndarrays are the same as
+    the execution queue */
+template <std::size_t num>
+bool queues_are_compatible(const sycl::queue &exec_q,
+                           const ::dpctl::tensor::usm_ndarray (&arrs)[num])
+{
+    for (std::size_t i = 0; i < num; ++i) {
+
+        if (exec_q != arrs[i].get_queue()) {
+            return false;
+        }
+    }
+    return true;
+}
+} // end namespace utils
+} // end namespace dpctl
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index d94a031801f3..9c9110b85384 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -29,11 +29,10 @@ import math import operator -import dpctl.tensor as dpt -import dpctl.utils as dpu import numpy import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device @@ -46,7 +45,7 @@ def _as_usm_ndarray(a, usm_type, sycl_queue): - """Converts input object to `dpctl.tensor.usm_ndarray`""" + """Converts input object to `dpnp.tensor.usm_ndarray`""" if isinstance(a, dpnp_array): a = a.get_array() @@ -340,7 +339,7 @@ class dpnp_nd_grid: def __init__( self, sparse=False, device=None, usm_type="device", sycl_queue=None ): - dpu.validate_usm_type(usm_type, allow_none=True) + dpt.validate_usm_type(usm_type, allow_none=True) self.sparse = sparse self.usm_type = "device" if usm_type is None else usm_type self.sycl_queue_normalized = dpnp.get_normalized_queue_device( diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 57bf50422fa0..96db4b4fe4e0 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -29,30 +29,32 @@ import warnings from functools import wraps -import dpctl.tensor as dpt -import dpctl.tensor._copy_utils as dtc -import dpctl.tensor._tensor_impl as dti -import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy -from dpctl.tensor._elementwise_common import ( - BinaryElementwiseFunc, - UnaryElementwiseFunc, -) -from dpctl.tensor._scalar_utils import ( - _get_dtype, - _get_shape, - _validate_dtype, -) import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi + +# pylint: disable=no-name-in-module +import dpnp.tensor as dpt +import dpnp.tensor._copy_utils as dtc +import dpnp.tensor._tensor_impl as dti +import dpnp.tensor._type_utils as dtu from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations from dpnp.dpnp_utils.dpnp_utils_common import ( find_buf_dtype_3out, find_buf_dtype_4out, ) +from dpnp.tensor._elementwise_common import ( + BinaryElementwiseFunc, + UnaryElementwiseFunc, +) +from
dpnp.tensor._scalar_utils import ( + _get_dtype, + _get_shape, + _validate_dtype, +) __all__ = [ "DPNPI0", @@ -117,7 +119,7 @@ class DPNPUnaryFunc(UnaryElementwiseFunc): sycl_dev - The :class:`dpctl.SyclDevice` where the function evaluation is carried out. The function is invoked when the argument of the unary function - requires casting, e.g. the argument of `dpctl.tensor.log` is an + requires casting, e.g. the argument of `dpnp.tensor.log` is an array with integral data type. """ @@ -135,7 +137,7 @@ def __init__( def _call_func(src, dst, sycl_queue, depends=None): """ A callback to register in UnaryElementwiseFunc class of - dpctl.tensor + dpnp.tensor """ if depends is None: @@ -449,7 +451,7 @@ def __call__( f"Expected output shape is {x.shape}, got {res.shape}" ) - if dpu.get_execution_queue((exec_q, res.sycl_queue)) is None: + if dpt.get_execution_queue((exec_q, res.sycl_queue)) is None: raise dpnp.exceptions.ExecutionPlacementError( "Input and output allocation queues are not compatible" ) @@ -586,7 +588,7 @@ class DPNPBinaryFunc(BinaryElementwiseFunc): evaluation is carried out. The function is only called when both arguments of the binary function require casting, e.g. both arguments of - `dpctl.tensor.logaddexp` are arrays with integral data type. + `dpnp.tensor.logaddexp` are arrays with integral data type. weak_type_resolver : {None, callable}, optional Function to influence type promotion behavior for Python scalar types of this binary function. The function takes 3 arguments: @@ -613,7 +615,7 @@ def __init__( def _call_func(src1, src2, dst, sycl_queue, depends=None): """ A callback to register in UnaryElementwiseFunc class of - dpctl.tensor + dpnp.tensor """ if depends is None: @@ -1060,7 +1062,7 @@ def __call__( f"Expected output shape is {res_shape}, got {res.shape}" ) - if dpu.get_execution_queue((exec_q, res.sycl_queue)) is None: + if dpt.get_execution_queue((exec_q, res.sycl_queue)) is None: raise dpnp.exceptions.ExecutionPlacementError( "Input and output allocation queues are not compatible" ) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index c3bfa8fa2e80..84aa9e47b27e 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -28,18 +28,18 @@ from numbers import Number -import dpctl.tensor as dpt import dpctl.utils as dpu -from dpctl.tensor._ctors import _cast_fill_val -from dpctl.tensor._tensor_impl import ( + +import dpnp +import dpnp.tensor as dpt +from dpnp.exceptions import ExecutionPlacementError +from dpnp.tensor._ctors import _cast_fill_val +from dpnp.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, _zeros_usm_ndarray, ) -import dpnp -from dpnp.exceptions import ExecutionPlacementError - def dpnp_fill(arr, val): arr = dpnp.get_usm_ndarray(arr) @@ -50,7 +50,7 @@ def dpnp_fill(arr, val): val = dpnp.get_usm_ndarray(val) if val.shape != (): raise ValueError("`val` must be a scalar or 0D-array") - if dpu.get_execution_queue((exec_q, val.sycl_queue)) is None: + if dpt.get_execution_queue((exec_q, val.sycl_queue)) is None: raise ExecutionPlacementError( "Input arrays have incompatible queues." 
) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 951f782c3007..00a1b2d00e5d 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -32,15 +32,15 @@ """ +# pylint: disable=duplicate-code # pylint: disable=invalid-name # pylint: disable=protected-access import warnings -import dpctl.tensor as dpt -import dpctl.tensor._type_utils as dtu - import dpnp +import dpnp.tensor as dpt +import dpnp.tensor._type_utils as dtu from . import memory as dpm from .exceptions import AxisError @@ -72,7 +72,7 @@ class dpnp_array: An array object represents a multidimensional tensor of numeric elements stored in a USM allocation on a SYCL device. - This is a wrapper around :class:`dpctl.tensor.usm_ndarray` that provides + This is a wrapper around :class:`dpnp.tensor.usm_ndarray` that provides methods to be compliant with original NumPy. """ @@ -609,12 +609,12 @@ def __usm_ndarray__(self): """ Property to support ``__usm_ndarray__`` protocol. - It assumes to return :class:`dpctl.tensor.usm_ndarray` instance + It assumes to return :class:`dpnp.tensor.usm_ndarray` instance corresponding to the content of the object. This property is intended to speed-up conversion from - :class:`dpnp.ndarray` to :class:`dpctl.tensor.usm_ndarray` passed into - :func:`dpctl.tensor.asarray` function. The input object that implements + :class:`dpnp.ndarray` to :class:`dpnp.tensor.usm_ndarray` passed into + :func:`dpnp.tensor.asarray` function. The input object that implements ``__usm_ndarray__`` protocol is recognized as owner of USM allocation that is managed by a smart pointer, and asynchronous deallocation will not involve GIL. @@ -631,13 +631,13 @@ def __xor__(self, other, /): def _create_from_usm_ndarray(usm_ary: dpt.usm_ndarray): """ Return :class:`dpnp.ndarray` instance from USM allocation providing - by an instance of :class:`dpctl.tensor.usm_ndarray`. + by an instance of :class:`dpnp.tensor.usm_ndarray`. """ if not isinstance(usm_ary, dpt.usm_ndarray): raise TypeError( - f"Expected dpctl.tensor.usm_ndarray, got {type(usm_ary)}" + f"Expected dpnp.tensor.usm_ndarray, got {type(usm_ary)}" ) res = dpnp_array.__new__(dpnp_array) res._array_obj = usm_ary @@ -956,7 +956,7 @@ def astype( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of - :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. If the value is ``None``, returned array is created on the same device as that array. @@ -1067,7 +1067,7 @@ def copy( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of - :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1162,7 +1162,7 @@ def data(self): @property def device(self): """ - Return :class:`dpctl.tensor.Device` object representing residence of + Return :class:`dpnp.tensor.Device` object representing residence of the array data. 
The ``Device`` object represents Array API notion of the device, and @@ -1329,7 +1329,7 @@ def flatten(self, /, order="C"): return self.reshape(-1, order=order, copy=True) def get_array(self): - """Get :class:`dpctl.tensor.usm_ndarray` object.""" + """Get :class:`dpnp.tensor.usm_ndarray` object.""" return self._array_obj # 'getfield', @@ -2182,7 +2182,7 @@ def to_device(self, device, /, *, stream=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of - :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. stream : {SyclQueue, None}, optional Execution queue to synchronize with. If ``None``, synchronization diff --git a/dpnp/dpnp_array_api_info.py b/dpnp/dpnp_array_api_info.py index 6a3939d046b0..ef3f1e4c2b60 100644 --- a/dpnp/dpnp_array_api_info.py +++ b/dpnp/dpnp_array_api_info.py @@ -36,7 +36,7 @@ """ -import dpctl.tensor as dpt +import dpnp.tensor as dpt def __array_namespace_info__(): diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 4975db17c717..4b38c2915178 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -35,10 +35,8 @@ """ -import dpctl.tensor as dpt -import dpctl.utils as dpu - import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array __all__ = [ @@ -66,8 +64,8 @@ def arange( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) @@ -93,7 +91,7 @@ def asarray( sycl_queue=None, ): """Converts `x1` to `dpnp_array`.""" - dpu.validate_usm_type(usm_type, allow_none=True) + dpt.validate_usm_type(usm_type, allow_none=True) if order is None: order = "K" @@ -153,8 +151,8 @@ def empty( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) @@ -184,8 +182,8 @@ def eye( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) @@ -215,8 +213,8 @@ def full( usm_type=None, sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=True) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=True) sycl_queue_normalized = dpnp.get_normalized_queue_device( fill_value, sycl_queue=sycl_queue, device=device @@ -248,8 +246,8 @@ def ones( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into 
`dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) @@ -288,8 +286,8 @@ def zeros( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index fba1a215756a..c9d16a20e83d 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -40,26 +40,27 @@ """ # pylint: disable=protected-access +# pylint: disable=no-name-in-module import os import dpctl -import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._device import normalize_queue_device import dpnp -from .dpnp_array import dpnp_array - # pylint: disable=no-name-in-module +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +from .dpnp_array import dpnp_array from .dpnp_utils import ( dpnp_descriptor, map_dtype_to_device, use_origin_backend, ) +from .tensor._device import normalize_queue_device def are_same_logical_tensors(ar1, ar2): @@ -141,7 +142,7 @@ def asnumpy(a, order="C"): def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None): """ - Return :class:`dpctl.tensor.usm_ndarray` from input object `a`. + Return :class:`dpnp.tensor.usm_ndarray` from input object `a`. Parameters ---------- @@ -158,7 +159,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. If the value is ``None``, returned array is created on the same device as `a`. @@ -179,7 +180,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None): out : usm_ndarray A dpctl USM ndarray from input array or scalar `a`. If `a` is instance of :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`, no array allocation will be done + or :class:`dpnp.tensor.usm_ndarray`, no array allocation will be done and `dtype`, `device`, `usm_type`, `sycl_queue` keywords will be ignored. @@ -255,7 +256,7 @@ def check_limitations( def check_supported_arrays_type(*arrays, scalar_type=False, all_scalars=False): """ Return ``True`` if each array has either type of scalar, - :class:`dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`. + :class:`dpnp.ndarray` or :class:`dpnp.tensor.usm_ndarray`. But if any array has unsupported type, ``TypeError`` will be raised. Parameters @@ -317,7 +318,7 @@ def default_float_type(device=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. 
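
The wrappers in dpnp_container.py above all follow the same pattern: validate `usm_type`, normalize the `device`/`sycl_queue` pair through `get_normalized_queue_device`, and delegate to the tensor layer. A minimal sketch of how the three placement keywords compose after this change (the "cpu" filter-selector string is an assumption about locally available hardware):

    import dpnp

    # usm_type is validated, the device string is normalized to a queue
    x = dpnp.ones((2, 3), device="cpu", usm_type="device")
    print(x.usm_type)    # -> "device"
    print(x.device)      # dpnp.tensor.Device for the normalized queue
    print(x.sycl_queue)  # dpctl.SyclQueue the allocation lives on
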
The value ``None`` is interpreted as to use a default device. @@ -406,7 +407,7 @@ def get_dpnp_descriptor( if queue is not None and copy_when_nondefault_queue: default_queue = dpctl.SyclQueue() queue_is_default = ( - dpctl.utils.get_execution_queue([queue, default_queue]) is not None + dpt.get_execution_queue([queue, default_queue]) is not None ) if not queue_is_default: ext_obj = dpnp.array(ext_obj, sycl_queue=default_queue) @@ -433,7 +434,7 @@ def get_include(): def get_normalized_queue_device(obj=None, device=None, sycl_queue=None): """ Utility to process complementary keyword arguments 'device' and 'sycl_queue' - in subsequent calls of functions from `dpctl.tensor` module. + in subsequent calls of functions from `dpnp.tensor` module. If both arguments 'device' and 'sycl_queue' have default value ``None`` and 'obj' has `sycl_queue` attribute, it assumes that Compute Follows Data @@ -444,7 +445,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None): ---------- obj : object, optional A python object. Can be an instance of `dpnp_array`, - `dpctl.tensor.usm_ndarray`, an object representing SYCL USM allocation + `dpnp.tensor.usm_ndarray`, an object representing SYCL USM allocation and implementing `__sycl_usm_array_interface__` protocol, an instance of `numpy.ndarray`, an object supporting Python buffer protocol, a Python scalar, or a (possibly nested) sequence of Python scalars. @@ -461,7 +462,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. The value ``None`` is interpreted as to use the same device as `obj`. @@ -471,7 +472,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None): ------- sycl_queue: dpctl.SyclQueue A :class:`dpctl.SyclQueue` object normalized by - `normalize_queue_device` call of `dpctl.tensor` module invoked with + `normalize_queue_device` call of `dpnp.tensor` module invoked with `device` and `sycl_queue` values. If both incoming `device` and `sycl_queue` are ``None`` and `obj` has `sycl_queue` attribute, the normalization will be performed for `obj.sycl_queue` value. @@ -539,13 +540,13 @@ def get_result_array(a, out=None, casting="safe"): def get_usm_ndarray(a): """ - Return :class:`dpctl.tensor.usm_ndarray` from input array `a`. + Return :class:`dpnp.tensor.usm_ndarray` from input array `a`. Parameters ---------- a : {dpnp.ndarray, usm_ndarray} Input array of supported type :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Returns ------- @@ -570,13 +571,13 @@ def get_usm_ndarray(a): def get_usm_ndarray_or_scalar(a): """ - Return scalar or :class:`dpctl.tensor.usm_ndarray` from input object `a`. + Return scalar or :class:`dpnp.tensor.usm_ndarray` from input object `a`. Parameters ---------- a : {scalar, dpnp_array, usm_ndarray} Input of any supported type: scalar, :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. 
Returns ------- @@ -633,7 +634,7 @@ def is_cuda_backend(obj=None): def is_supported_array_or_scalar(a): """ Return ``True`` if `a` is a scalar or an array of either - :class:`dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray` type, + :class:`dpnp.ndarray` or :class:`dpnp.tensor.usm_ndarray` type, ``False`` otherwise. Parameters @@ -655,7 +656,7 @@ def is_supported_array_or_scalar(a): def is_supported_array_type(a): """ Return ``True`` if an array of either type :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray` type, ``False`` otherwise. + or :class:`dpnp.tensor.usm_ndarray` type, ``False`` otherwise. Parameters ---------- diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 5bcf5ea19b82..da6b45517eb3 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -43,10 +43,10 @@ import operator -import dpctl.tensor as dpt import numpy import dpnp +import dpnp.tensor as dpt from dpnp import dpnp_container from .dpnp_algo.dpnp_arraycreation import ( @@ -175,7 +175,7 @@ def arange( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -295,7 +295,7 @@ def array( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -440,7 +440,7 @@ def asanyarray( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -545,7 +545,7 @@ def asarray( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -646,7 +646,7 @@ def ascontiguousarray( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -760,7 +760,7 @@ def asfortranarray( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
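
The helpers renamed in the hunks above unwrap without copying; a small sketch of the documented behavior, assuming `get_usm_ndarray_or_scalar` is re-exported at the dpnp top level the same way `get_usm_ndarray` is:

    import dpnp

    a = dpnp.arange(5)
    ua = dpnp.get_usm_ndarray(a)  # underlying usm_ndarray, zero-copy
    assert ua is a.get_array()

    s = dpnp.get_usm_ndarray_or_scalar(3.5)  # scalars pass through
    assert s == 3.5
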
@@ -897,7 +897,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. If the value is ``None``, returned array is created on the same device as `x`. @@ -966,7 +966,7 @@ def copy( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1086,7 +1086,7 @@ def diag(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1191,7 +1191,7 @@ def diagflat(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1297,7 +1297,7 @@ def empty( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1403,7 +1403,7 @@ def empty_like( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1515,7 +1515,7 @@ def eye( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1627,7 +1627,7 @@ def frombuffer( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
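
For the `device` keyword repeated through these docstrings, passing the `Device` object of an existing array is the cheapest way to keep a result co-located with its input; a sketch:

    import dpnp

    x = dpnp.arange(3, dtype="i4")
    y = dpnp.astype(x, "f4", device=x.device)  # stays on x's queue
    assert y.sycl_queue == x.sycl_queue
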
@@ -1747,7 +1747,7 @@ def fromfile( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1868,7 +1868,7 @@ def fromfunction( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1979,7 +1979,7 @@ def fromiter( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2081,7 +2081,7 @@ def fromstring( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2158,9 +2158,9 @@ def from_dlpack(x, /, *, device=None, copy=None): to a non-partitioned SYCL device. * :class:`dpctl.SyclQueue` : Implies SYCL device targeted by the SYCL queue. - * :class:`dpctl.tensor.Device` : Implies SYCL device + * :class:`dpnp.tensor.Device` : Implies SYCL device ``device.sycl_queue``. The `device` object is obtained via - :attr:`dpctl.tensor.usm_ndarray.device`. + :attr:`dpnp.tensor.usm_ndarray.device`. * ``(device_type, device_id)`` : 2-tuple matching the format of the output of the :meth:`dpnp.ndarray.__dlpack_device__`: an integer enumerator representing the device type followed by an integer @@ -2205,7 +2205,7 @@ def from_dlpack(x, /, *, device=None, copy=None): If the return type is :class:`dpnp.ndarray`, the associated SYCL queue is derived from the `device` keyword. When `device` keyword value has type :class:`dpctl.SyclQueue`, the explicit queue instance is used, when `device` - keyword value has type :class:`dpctl.tensor.Device`, the + keyword value has type :class:`dpnp.tensor.Device`, the ``device.sycl_queue`` is used. In all other cases, the cached SYCL queue corresponding to the implied SYCL device is used. @@ -2261,7 +2261,7 @@ def full( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2370,7 +2370,7 @@ def full_like( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
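
The `from_dlpack` semantics described in this hunk can be exercised round-trip; the 2-tuple form of `device` mirrors the `__dlpack_device__` output. A hedged sketch:

    import dpnp

    x = dpnp.linspace(0, 1, 4)
    dev = x.__dlpack_device__()          # (device_type, device_id)
    y = dpnp.from_dlpack(x, device=dev)  # zero-copy when devices match
    assert bool((y == x).all())
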
@@ -2485,7 +2485,7 @@ def geomspace( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2597,7 +2597,7 @@ def identity( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2715,7 +2715,7 @@ def linspace( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2827,7 +2827,7 @@ def loadtxt( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2942,7 +2942,7 @@ def logspace( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3148,7 +3148,7 @@ class MGridClass: `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3227,7 +3227,7 @@ class OGridClass: `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3317,7 +3317,7 @@ def ones( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3429,7 +3429,7 @@ def ones_like( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
@@ -3602,7 +3602,7 @@ def tri( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3840,7 +3840,7 @@ def vander( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3970,7 +3970,7 @@ def zeros( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -4082,7 +4082,7 @@ def zeros_like( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. diff --git a/dpnp/dpnp_iface_bitwise.py b/dpnp/dpnp_iface_bitwise.py index 733fbc697241..604fd365ee18 100644 --- a/dpnp/dpnp_iface_bitwise.py +++ b/dpnp/dpnp_iface_bitwise.py @@ -43,10 +43,10 @@ # pylint: disable=no-name-in-module # pylint: disable=protected-access -import dpctl.tensor._tensor_elementwise_impl as ti import numpy import dpnp.backend.extensions.ufunc._ufunc_impl as ufi +import dpnp.tensor._tensor_elementwise_impl as ti from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc diff --git a/dpnp/dpnp_iface_counting.py b/dpnp/dpnp_iface_counting.py index a4b85aa85294..7bb13422f819 100644 --- a/dpnp/dpnp_iface_counting.py +++ b/dpnp/dpnp_iface_counting.py @@ -39,9 +39,8 @@ """ -import dpctl.tensor as dpt - import dpnp +import dpnp.tensor as dpt def count_nonzero(a, axis=None, *, keepdims=False, out=None): diff --git a/dpnp/dpnp_iface_functional.py b/dpnp/dpnp_iface_functional.py index 1985eced2e71..0ed965b0698f 100644 --- a/dpnp/dpnp_iface_functional.py +++ b/dpnp/dpnp_iface_functional.py @@ -41,15 +41,14 @@ # pylint: disable=protected-access -from dpctl.tensor._numpy_helper import ( - normalize_axis_index, - normalize_axis_tuple, -) - import dpnp # pylint: disable=no-name-in-module -from dpnp.dpnp_utils import get_usm_allocations +from .dpnp_utils import get_usm_allocations +from .tensor._numpy_helper import ( + normalize_axis_index, + normalize_axis_tuple, +) def apply_along_axis(func1d, axis, arr, *args, **kwargs): diff --git a/dpnp/dpnp_iface_histograms.py b/dpnp/dpnp_iface_histograms.py index 0a2f18fe3644..7e91968926fc 100644 --- a/dpnp/dpnp_iface_histograms.py +++ b/dpnp/dpnp_iface_histograms.py @@ -53,6 +53,7 @@ result_type_for_device, to_supported_dtypes, ) +from dpnp.tensor import get_coerced_usm_type, get_execution_queue # pylint: disable=no-name-in-module from .dpnp_utils import get_usm_allocations @@ -87,10 +88,10 @@ def _ravel_check_a_and_weights(a, weights): if weights is not None: # check that `weights` array has supported type dpnp.check_supported_arrays_type(weights) - 
usm_type = dpu.get_coerced_usm_type([usm_type, weights.usm_type]) + usm_type = get_coerced_usm_type([usm_type, weights.usm_type]) # check that arrays have the same allocation queue - if dpu.get_execution_queue([a.sycl_queue, weights.sycl_queue]) is None: + if get_execution_queue([a.sycl_queue, weights.sycl_queue]) is None: raise ValueError( "a and weights must be allocated on the same SYCL queue" ) @@ -173,7 +174,7 @@ def _get_bin_edges(a, bins, range, usm_type): elif numpy.ndim(bins) == 1: if dpnp.is_supported_array_type(bins): - if dpu.get_execution_queue([a.sycl_queue, bins.sycl_queue]) is None: + if get_execution_queue([a.sycl_queue, bins.sycl_queue]) is None: raise ValueError( "a and bins must be allocated on the same SYCL queue" ) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 2a90f6cff637..1c9776582b73 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -44,19 +44,18 @@ import operator from collections.abc import Iterable -import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._copy_utils import _nonzero_impl -from dpctl.tensor._indexing_functions import _get_indexing_mode -from dpctl.tensor._numpy_helper import normalize_axis_index import dpnp # pylint: disable=no-name-in-module import dpnp.backend.extensions.indexing._indexing_impl as indexing_ext +# pylint: disable=no-name-in-module +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + # pylint: disable=no-name-in-module from .dpnp_algo import ( dpnp_putmask, @@ -64,6 +63,9 @@ from .dpnp_array import dpnp_array from .dpnp_utils import call_origin, get_usm_allocations from .exceptions import ExecutionPlacementError +from .tensor._copy_utils import _nonzero_impl +from .tensor._indexing_functions import _get_indexing_mode +from .tensor._numpy_helper import normalize_axis_index def _ravel_multi_index_checks(multi_index, dims, order): @@ -99,7 +101,7 @@ def _build_choices_list(choices): list of arrays. If a single array of dimension greater than one, the array will be unstacked. - Returns a list of :class:`dpctl.tensor.usm_ndarray`s. + Returns a list of :class:`dpnp.tensor.usm_ndarray`s. """ if dpnp.is_supported_array_type(choices): @@ -129,7 +131,7 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0): f"got {out.dtype}" ) - if dpu.get_execution_queue((q, out.sycl_queue)) is None: + if dpt.get_execution_queue((q, out.sycl_queue)) is None: raise ExecutionPlacementError( "Input and output allocation queues are not compatible" ) @@ -291,7 +293,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): f"Output array of type {x.dtype} is needed, " f"got {out.dtype}" ) - if dpu.get_execution_queue((q, out.sycl_queue)) is None: + if dpt.get_execution_queue((q, out.sycl_queue)) is None: raise ExecutionPlacementError( "Input and output allocation queues are not compatible" ) @@ -445,7 +447,7 @@ def diag_indices(n, ndim=2, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
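
The `get_execution_queue` check that `_ravel_check_a_and_weights` and `_get_bin_edges` now call through `dpnp.tensor` returns the common queue, or ``None`` on a mismatch, which the histogram code converts into a ``ValueError``. A sketch (queue construction is an assumption about the local device):

    import dpctl
    from dpnp.tensor import get_execution_queue

    q1 = dpctl.SyclQueue()
    q2 = dpctl.SyclQueue()  # a second, distinct queue
    assert get_execution_queue([q1, q1]) is not None
    assert get_execution_queue([q1, q2]) is None  # incompatible
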
@@ -1044,7 +1046,7 @@ def indices( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1308,7 +1310,7 @@ def mask_indices( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2321,7 +2323,7 @@ def tril_indices( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2538,7 +2540,7 @@ def triu_indices( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 6464bd49af1b..faa84dd538a4 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -43,16 +43,15 @@ # pylint: disable=duplicate-code # pylint: disable=no-name-in-module - -import dpctl.tensor as dpt -import dpctl.tensor._tensor_elementwise_impl as ti import dpctl.utils as dpu import numpy import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi -from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as ti +from .dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc from .dpnp_array import dpnp_array from .dpnp_utils import get_usm_allocations from .exceptions import ExecutionPlacementError @@ -1263,7 +1262,7 @@ def isin( usm_element = dpnp.get_usm_ndarray(element) else: if ( - dpu.get_execution_queue( + dpt.get_execution_queue( (element.sycl_queue, test_elements.sycl_queue) ) is None diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 0594a406ac5a..b96d36a40e6a 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -45,14 +45,10 @@ from typing import NamedTuple import dpctl -import dpctl.tensor as dpt import numpy -from dpctl.tensor._numpy_helper import ( - normalize_axis_index, - normalize_axis_tuple, -) import dpnp +import dpnp.tensor as dpt from .dpnp_array import dpnp_array @@ -60,6 +56,10 @@ from .dpnp_utils import get_usm_allocations from .dpnp_utils.dpnp_utils_pad import dpnp_pad from .exceptions import AxisError +from .tensor._numpy_helper import ( + normalize_axis_index, + normalize_axis_tuple, +) class InsertDeleteParams(NamedTuple): @@ -692,7 +692,7 @@ def asarray_chkfinite( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + 
:class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -791,7 +791,7 @@ def asfarray(a, dtype=None, *, device=None, usm_type=None, sycl_queue=None): a : array_like Input data, in any form that can be converted to an array. This includes an instance of :class:`dpnp.ndarray` or - :class:`dpctl.tensor.usm_ndarray`, an object representing + :class:`dpnp.tensor.usm_ndarray`, an object representing SYCL USM allocation and implementing `__sycl_usm_array_interface__` protocol, an instance of :class:`numpy.ndarray`, an object supporting Python buffer protocol, a Python scalar, or a (possibly nested) @@ -808,7 +808,7 @@ def asfarray(a, dtype=None, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index e06904a57bda..ddecd1f751d9 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -40,25 +40,21 @@ """ # pylint: disable=protected-access +# pylint: disable=duplicate-code # pylint: disable=no-name-in-module import builtins import warnings -import dpctl.tensor as dpt -import dpctl.tensor._tensor_elementwise_impl as ti -import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy -from dpctl.tensor._numpy_helper import ( - normalize_axis_index, - normalize_axis_tuple, -) -from dpctl.tensor._type_utils import _acceptance_fn_divide import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as ti +import dpnp.tensor._type_utils as dtu from .dpnp_algo.dpnp_elementwise_common import ( DPNPI0, @@ -85,6 +81,10 @@ from .dpnp_utils.dpnp_utils_linearalgebra import dpnp_cross from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call from .exceptions import ExecutionPlacementError +from .tensor._numpy_helper import ( + normalize_axis_index, + normalize_axis_tuple, +) def _get_max_min(dtype): @@ -273,9 +273,9 @@ def _process_ediff1d_args(arg, arg_name, ary_dtype, ary_sycl_queue, usm_type): if not dpnp.is_supported_array_type(arg): arg = dpnp.asarray(arg, usm_type=usm_type, sycl_queue=ary_sycl_queue) else: - usm_type = dpu.get_coerced_usm_type([usm_type, arg.usm_type]) + usm_type = dpt.get_coerced_usm_type([usm_type, arg.usm_type]) # check that arrays have the same allocation queue - if dpu.get_execution_queue([ary_sycl_queue, arg.sycl_queue]) is None: + if dpt.get_execution_queue([ary_sycl_queue, arg.sycl_queue]) is None: raise ExecutionPlacementError( f"ary and {arg_name} must be allocated on the same SYCL queue" ) @@ -307,7 +307,7 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None): f"a {name} value must be 0-dimensional, " f"but got {param.ndim}-dim" ) - if dpu.get_execution_queue([exec_q, param.sycl_queue]) is None: + if dpt.get_execution_queue([exec_q, param.sycl_queue]) is None: raise ValueError( f"input arrays and {name} must be allocated " "on the same SYCL queue" @@ -1564,7 +1564,7 @@ def diff(a, n=1, axis=-1, prepend=None, append=None): mkl_fn_to_call="_mkl_div_to_call", mkl_impl_fn="_div", binary_inplace_fn=ti._divide_inplace, - acceptance_fn=_acceptance_fn_divide, + 
acceptance_fn=dtu._acceptance_fn_divide, ) @@ -2724,7 +2724,7 @@ def gradient(f, *varargs, axis=None, edge_order=1): if dpnp.isscalar(ax_dx): usm_type = f.usm_type else: - usm_type = dpu.get_coerced_usm_type([f.usm_type, ax_dx.usm_type]) + usm_type = dpt.get_coerced_usm_type([f.usm_type, ax_dx.usm_type]) out = dpnp.empty_like(f, dtype=otype, usm_type=usm_type) # spacing for the current axis diff --git a/dpnp/dpnp_iface_nanfunctions.py b/dpnp/dpnp_iface_nanfunctions.py index a5fb750cf586..10fffb342305 100644 --- a/dpnp/dpnp_iface_nanfunctions.py +++ b/dpnp/dpnp_iface_nanfunctions.py @@ -167,7 +167,7 @@ def nanargmax(a, axis=None, out=None, *, keepdims=False): Limitations ----------- Input array is only supported as either :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Input array data types are limited by supported DPNP :ref:`Data types`. See Also @@ -251,7 +251,7 @@ def nanargmin(a, axis=None, out=None, *, keepdims=False): Limitations ----------- Input and output arrays are only supported as either :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Input array data types are limited by supported DPNP :ref:`Data types`. See Also @@ -466,7 +466,7 @@ def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=True): Limitations ----------- Input array is only supported as either :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Parameters `where`, and `initial` are only supported with their default values. Otherwise ``NotImplementedError`` exception will be raised. @@ -782,7 +782,7 @@ def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=True): Limitations ----------- Input array is only supported as either :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Parameters `where`, and `initial` are only supported with their default values. Otherwise ``NotImplementedError`` exception will be raised. @@ -896,7 +896,7 @@ def nanprod( Limitations ----------- Input array is only supported as either :class:`dpnp.ndarray` or - :class:`dpctl.tensor.usm_ndarray`. + :class:`dpnp.tensor.usm_ndarray`. Parameters `initial`, and `where` are only supported with their default values. Otherwise the function will be executed sequentially on CPU. 
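
`get_coerced_usm_type`, now imported from `dpnp.tensor` in `_process_ediff1d_args` and `gradient`, coerces toward the most device-bound USM kind ("device" over "shared" over "host"); a sketch:

    from dpnp.tensor import get_coerced_usm_type

    assert get_coerced_usm_type(["host", "device"]) == "device"
    assert get_coerced_usm_type(["shared", "host"]) == "shared"
    assert get_coerced_usm_type(["host", "host"]) == "host"
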
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 6eefe010b699..856fdbc98936 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -39,11 +39,14 @@ """ -import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as dti +# pylint: disable=duplicate-code import dpnp +# pylint: disable=no-name-in-module +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as dti + from .dpnp_array import dpnp_array from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index 9c5097a5f3e3..8f6f3e80f0d1 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -41,10 +41,8 @@ from collections.abc import Sequence -import dpctl.tensor as dpt -from dpctl.tensor._numpy_helper import normalize_axis_index - import dpnp +import dpnp.tensor as dpt # pylint: disable=no-name-in-module from .dpnp_algo import ( @@ -54,6 +52,7 @@ from .dpnp_utils import ( map_dtype_to_device, ) +from .tensor._numpy_helper import normalize_axis_index def _wrap_sort_argsort( @@ -65,7 +64,7 @@ def _wrap_sort_argsort( descending=False, stable=True, ): - """Wrap a sorting call from dpctl.tensor interface.""" + """Wrap a sorting call from dpnp.tensor interface.""" if order is not None: raise NotImplementedError( diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index 7e092184366c..bf27fc98a4ce 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -39,26 +39,26 @@ """ +# pylint: disable=no-name-in-module + import math -import dpctl.tensor as dpt -import dpctl.tensor._tensor_elementwise_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._numpy_helper import normalize_axis_index import dpnp - -# pylint: disable=no-name-in-module import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext -from dpnp.dpnp_utils.dpnp_utils_common import ( +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as ti + +from .dpnp_utils import get_usm_allocations +from .dpnp_utils.dpnp_utils_common import ( result_type_for_device, to_supported_dtypes, ) - -from .dpnp_utils import get_usm_allocations from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call from .dpnp_utils.dpnp_utils_statistics import dpnp_cov, dpnp_median +from .tensor._numpy_helper import normalize_axis_index def _count_reduce_items(arr, axis, where=True): @@ -670,7 +670,7 @@ def _run_native_sliding_dot_product1d(a, v, l_pad, r_pad, rdtype): a_casted = dpnp.asarray(a, dtype=supported_dtype, order="C") v_casted = dpnp.asarray(v, dtype=supported_dtype, order="C") - usm_type = dpu.get_coerced_usm_type([a_casted.usm_type, v_casted.usm_type]) + usm_type = dpt.get_coerced_usm_type([a_casted.usm_type, v_casted.usm_type]) out_size = l_pad + r_pad + a_casted.size - v_casted.size + 1 # out type is the same as input type out = dpnp.empty_like(a_casted, shape=out_size, usm_type=usm_type) diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py index a46f06c10e08..35428a0416e7 100644 --- a/dpnp/dpnp_iface_trigonometric.py +++ b/dpnp/dpnp_iface_trigonometric.py @@ -42,13 +42,11 @@ # pylint: disable=protected-access # pylint: disable=no-name-in-module - -import dpctl.tensor as dpt -import dpctl.tensor._tensor_elementwise_impl as ti -import dpctl.tensor._type_utils as dtu - import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as ti 
+import dpnp.tensor._type_utils as dtu from .dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call diff --git a/dpnp/dpnp_iface_types.py b/dpnp/dpnp_iface_types.py index 8fdb9e1d3d38..d3b295289831 100644 --- a/dpnp/dpnp_iface_types.py +++ b/dpnp/dpnp_iface_types.py @@ -37,10 +37,10 @@ import functools import dpctl -import dpctl.tensor as dpt import numpy import dpnp +import dpnp.tensor as dpt from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface_window.py b/dpnp/dpnp_iface_window.py index f8d6df07443d..bc12e714663c 100644 --- a/dpnp/dpnp_iface_window.py +++ b/dpnp/dpnp_iface_window.py @@ -111,7 +111,7 @@ def bartlett(M, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -205,7 +205,7 @@ def blackman(M, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -296,7 +296,7 @@ def hamming(M, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -380,7 +380,7 @@ def hanning(M, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -466,7 +466,7 @@ def kaiser(M, beta, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
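
The window functions touched above accept the same placement keywords as the creation routines, so a result can be pinned to an explicit queue; a sketch (queue choice is an assumption):

    import dpctl
    import dpnp

    q = dpctl.SyclQueue()
    w = dpnp.hamming(8, sycl_queue=q)
    assert w.sycl_queue == q and w.shape == (8,)
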
diff --git a/dpnp/dpnp_utils/dpnp_algo_utils.pyx b/dpnp/dpnp_utils/dpnp_algo_utils.pyx index 6ef9c9c28a12..00f40a0358e8 100644 --- a/dpnp/dpnp_utils/dpnp_algo_utils.pyx +++ b/dpnp/dpnp_utils/dpnp_algo_utils.pyx @@ -36,13 +36,13 @@ This module contains different helpers and utilities """ import dpctl -import dpctl.utils as dpu import numpy import dpnp import dpnp.config as config import dpnp.dpnp_container as dpnp_container from dpnp.dpnp_array import dpnp_array +from dpnp.tensor import get_coerced_usm_type, get_execution_queue cimport cpython cimport cython @@ -153,7 +153,7 @@ def call_origin(function, *args, **kwargs): kwargx = convert_item(kwarg) kwargs_new[key] = kwargx - exec_q = dpu.get_execution_queue(alloc_queues) + exec_q = get_execution_queue(alloc_queues) if exec_q is None: exec_q = dpnp.get_normalized_queue_device(sycl_queue=sycl_queue) # print(f"DPNP call_origin(): backend called. \n\t function={function}, \n\t args_new={args_new}, \n\t kwargs_new={kwargs_new}, \n\t dpnp_inplace={dpnp_inplace}") @@ -221,7 +221,7 @@ def _get_coerced_usm_type(objects): elif len(types_in_use) == 1: return types_in_use[0] - common_usm_type = dpu.get_coerced_usm_type(types_in_use) + common_usm_type = get_coerced_usm_type(types_in_use) if common_usm_type is None: raise ValueError("Input arrays must have coerced USM types") return common_usm_type @@ -234,7 +234,7 @@ def _get_common_allocation_queue(objects): elif len(queues_in_use) == 1: return queues_in_use[0] - common_queue = dpu.get_execution_queue(queues_in_use) + common_queue = get_execution_queue(queues_in_use) if common_queue is None: raise ValueError("Input arrays must be allocated on the same SYCL queue") return common_queue @@ -401,13 +401,13 @@ cdef tuple get_common_usm_allocation(dpnp_descriptor x1, dpnp_descriptor x2): array1_obj = x1.get_array() array2_obj = x2.get_array() - common_usm_type = dpctl.utils.get_coerced_usm_type((array1_obj.usm_type, array2_obj.usm_type)) + common_usm_type = get_coerced_usm_type((array1_obj.usm_type, array2_obj.usm_type)) if common_usm_type is None: raise ValueError( "could not recognize common USM type for inputs of USM types {} and {}" "".format(array1_obj.usm_type, array2_obj.usm_type)) - common_sycl_queue = dpu.get_execution_queue((array1_obj.sycl_queue, array2_obj.sycl_queue)) + common_sycl_queue = get_execution_queue((array1_obj.sycl_queue, array2_obj.sycl_queue)) if common_sycl_queue is None: raise ValueError( "could not recognize common SYCL queue for inputs in SYCL queues {} and {}" @@ -532,13 +532,13 @@ cdef class dpnp_descriptor: return self.origin_pyobj def get_array(self): - if isinstance(self.origin_pyobj, dpctl.tensor.usm_ndarray): + if isinstance(self.origin_pyobj, dpnp.tensor.usm_ndarray): return self.origin_pyobj if isinstance(self.origin_pyobj, dpnp_array): return self.origin_pyobj.get_array() raise TypeError( - "expected either dpctl.tensor.usm_ndarray or dpnp.dpnp_array.dpnp_array, got {}" + "expected either dpnp.tensor.usm_ndarray or dpnp.dpnp_array.dpnp_array, got {}" "".format(type(self.origin_pyobj))) cdef void * get_data(self): diff --git a/dpnp/dpnp_utils/dpnp_utils_common.py b/dpnp/dpnp_utils/dpnp_utils_common.py index e4bde2e1ec86..55d0f57ca1e2 100644 --- a/dpnp/dpnp_utils/dpnp_utils_common.py +++ b/dpnp/dpnp_utils/dpnp_utils_common.py @@ -29,9 +29,8 @@ from collections.abc import Iterable -import dpctl.tensor._type_utils as dtu - import dpnp +import dpnp.tensor._type_utils as dtu from dpnp.dpnp_utils import map_dtype_to_device __all__ = [ diff --git 
a/dpnp/dpnp_utils/dpnp_utils_einsum.py b/dpnp/dpnp_utils/dpnp_utils_einsum.py
index 4a1a58635989..b954e3f99467 100644
--- a/dpnp/dpnp_utils/dpnp_utils_einsum.py
+++ b/dpnp/dpnp_utils/dpnp_utils_einsum.py
@@ -31,7 +31,6 @@
 import operator
 import warnings

-import dpctl
 import numpy

 import dpnp
@@ -1023,7 +1022,7 @@ def dpnp_einsum(
     res_usm_type, exec_q = get_usm_allocations(arrays)
     if out is not None:
         dpnp.check_supported_arrays_type(out)
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+        if dpnp.tensor.get_execution_queue((exec_q, out.sycl_queue)) is None:
             raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index d2a1cdfbac46..2331eb7a10cc 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -26,21 +26,22 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************

-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._numpy_helper import (
-    normalize_axis_index,
-    normalize_axis_tuple,
-)

 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
+
+# pylint: disable=no-name-in-module
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations
 from dpnp.exceptions import AxisError, ExecutionPlacementError
+from dpnp.tensor._numpy_helper import (
+    normalize_axis_index,
+    normalize_axis_tuple,
+)

 __all__ = [
     "dpnp_cross",
@@ -692,7 +693,7 @@ def _validate_out_array(out, exec_q):
     """Validate out is supported array and has correct queue."""
     if out is not None:
         dpnp.check_supported_arrays_type(out)
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
             raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
@@ -769,7 +770,7 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False):
     The routine that is used to perform the main calculation
     depends on input arrays data type: 1) For integer and boolean data types,
-    `dpctl.tensor.vecdot` form the Data Parallel Control library is used,
+    `dpnp.tensor.vecdot` from the dpnp tensor module is used,
     2) For real-valued floating point data types, `dot` routines from
     BLAS library of OneMKL are used, and 3) For complex data types,
     `dotu` or `dotc` routines from BLAS library of OneMKL are used.
@@ -817,7 +818,7 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False):
         _manager.add_event_pair(ht_ev, dot_ev)
     else:
         # oneapi::mkl::blas::dot does not support integer dtypes,
-        # so using dpctl.tensor.vecdot instead
+        # so using dpnp.tensor.vecdot instead
         a_usm = dpnp.get_usm_ndarray(a)
         b_usm = dpnp.get_usm_ndarray(b)
         result = dpnp_array._create_from_usm_ndarray(dpt.vecdot(a_usm, b_usm))
@@ -1116,7 +1117,7 @@ def dpnp_multiplication(
     else:
         # oneapi::mkl::blas::gemm/gemv do not support integer dtypes,
         # except for special cases determined in `_gemm_special_case`,
-        # use dpctl.tensor.matmul for unsupported cases
+        # use dpnp.tensor.matmul for unsupported cases

         # `dpt.matmul` does not support `casting` kwarg.
         # We may need to change input dtypes based on given `casting`.
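# Illustrative sketch (not the actual dpnp source): the dtype-based dispatch
# described in the dpnp_dot docstring above. `pick_dot_backend` is a
# hypothetical helper introduced purely for illustration.
import numpy

def pick_dot_backend(dtype):
    """Map a dtype to the backend named in the dpnp_dot docstring."""
    if numpy.issubdtype(dtype, numpy.complexfloating):
        return "oneMKL BLAS dotu/dotc"
    if numpy.issubdtype(dtype, numpy.floating):
        return "oneMKL BLAS dot"
    # booleans and integers fall back to the tensor implementation
    return "dpnp.tensor.vecdot"

assert pick_dot_backend(numpy.dtype(numpy.int32)) == "dpnp.tensor.vecdot"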
diff --git a/dpnp/dpnp_utils/dpnp_utils_reduction.py b/dpnp/dpnp_utils/dpnp_utils_reduction.py index 8c13c6380870..ba9830bd7eff 100644 --- a/dpnp/dpnp_utils/dpnp_utils_reduction.py +++ b/dpnp/dpnp_utils/dpnp_utils_reduction.py @@ -33,7 +33,7 @@ def dpnp_wrap_reduction_call(usm_a, out, _reduction_fn, res_dt, **kwargs): - """Wrap a reduction call from dpctl.tensor interface.""" + """Wrap a reduction call from dpnp.tensor interface.""" input_out = out if out is None: diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index c8414b661851..ac62ddcc2766 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -28,13 +28,11 @@ import warnings -import dpctl -import dpctl.tensor as dpt -from dpctl.tensor._numpy_helper import normalize_axis_tuple - import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from dpnp.exceptions import ExecutionPlacementError +from dpnp.tensor._numpy_helper import normalize_axis_tuple __all__ = ["dpnp_cov", "dpnp_median"] @@ -67,7 +65,7 @@ def _calc_nanmedian(a, out=None): res = dpnp.empty_like(valid_counts, dtype=a.dtype) else: dpnp.check_supported_arrays_type(out) - exec_q = dpctl.utils.get_execution_queue((a.sycl_queue, out.sycl_queue)) + exec_q = dpt.get_execution_queue((a.sycl_queue, out.sycl_queue)) if exec_q is None: raise ExecutionPlacementError( "Input and output allocation queues are not compatible" diff --git a/dpnp/exceptions/__init__.py b/dpnp/exceptions/__init__.py index 26d78a853f41..99587311cf0d 100644 --- a/dpnp/exceptions/__init__.py +++ b/dpnp/exceptions/__init__.py @@ -32,10 +32,11 @@ SyclQueueCreationError, ) from dpctl.memory import USMAllocationError -from dpctl.tensor._dlpack import DLPackCreationError -from dpctl.utils import ExecutionPlacementError from numpy.exceptions import AxisError +from dpnp.tensor import ExecutionPlacementError +from dpnp.tensor._dlpack import DLPackCreationError + __all__ = [ "AxisError", "DLPackCreationError", diff --git a/dpnp/fft/dpnp_iface_fft.py b/dpnp/fft/dpnp_iface_fft.py index fcc222640c9a..90e1a112bdaf 100644 --- a/dpnp/fft/dpnp_iface_fft.py +++ b/dpnp/fft/dpnp_iface_fft.py @@ -263,7 +263,7 @@ def fftfreq( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1581,7 +1581,7 @@ def rfftfreq( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
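# Self-contained sketch of the compute-follows-data check used in the hunks
# above: get_execution_queue returns None unless all queues coalesce to a
# single queue. Assumes a default SYCL device is available at runtime.
import dpnp
import dpnp.tensor as dpt
from dpnp.exceptions import ExecutionPlacementError

a = dpnp.arange(3)
out = dpnp.empty(3)
if dpt.get_execution_queue((a.sycl_queue, out.sycl_queue)) is None:
    raise ExecutionPlacementError(
        "Input and output allocation queues are not compatible"
    )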
diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 28032b9d3be2..733436ab9887 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -41,18 +41,18 @@ from collections.abc import Sequence -import dpctl -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._numpy_helper import ( - normalize_axis_index, - normalize_axis_tuple, -) import dpnp import dpnp.backend.extensions.fft._fft_impl as fi +import dpnp.tensor._tensor_impl as ti from dpnp.exceptions import ExecutionPlacementError +from dpnp.tensor import get_execution_queue +from dpnp.tensor._numpy_helper import ( + normalize_axis_index, + normalize_axis_tuple, +) from ..dpnp_array import dpnp_array from ..dpnp_utils import map_dtype_to_device @@ -196,8 +196,8 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): out_usm = None if out is None else dpnp.get_usm_ndarray(out) if ( out is not None - and out_usm.strides == tuple(out_strides) - and not ti._array_overlap(a_usm, out_usm) + and out.strides == tuple(out_strides) + and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = out_usm result = out @@ -546,10 +546,7 @@ def _validate_out_keyword(a, out, s, axes, c2c, c2r, r2c): """Validate out keyword argument.""" if out is not None: dpnp.check_supported_arrays_type(out) - if ( - dpctl.utils.get_execution_queue((a.sycl_queue, out.sycl_queue)) - is None - ): + if get_execution_queue((a.sycl_queue, out.sycl_queue)) is None: raise ExecutionPlacementError( "Input and output allocation queues are not compatible" ) @@ -779,7 +776,7 @@ def dpnp_fillfreq(a, m, n, val): """Fill an array with the sample frequencies""" exec_q = a.sycl_queue - _manager = dpctl.utils.SequentialOrderManager[exec_q] + _manager = dpu.SequentialOrderManager[exec_q] # it's assumed there are no dependent events to populate the array ht_lin_ev, lin_ev = ti._linspace_step(0, 1, a[:m].get_array(), exec_q) diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index 6959565ecf17..625d387667ac 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -45,10 +45,10 @@ from typing import NamedTuple import numpy -from dpctl.tensor._numpy_helper import normalize_axis_tuple import dpnp from dpnp.backend.extensions.lapack._lapack_impl import LinAlgError +from dpnp.tensor._numpy_helper import normalize_axis_tuple from .dpnp_utils_linalg import ( assert_2d, diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 6881c7787e9f..cf6d1ff231f2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -42,15 +42,17 @@ from typing import NamedTuple -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li + +# pylint: disable=no-name-in-module +import dpnp.tensor._tensor_impl as ti from dpnp.dpnp_utils import get_usm_allocations +from dpnp.tensor._numpy_helper import normalize_axis_index # pylint:disable=missing-class-docstring @@ -1262,7 +1264,7 @@ def _real_type(dtype, device=None): type is created. 
`device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, - or a :class:`dpctl.tensor.Device` object returned by + or a :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. diff --git a/dpnp/memory/_memory.py b/dpnp/memory/_memory.py index 70d93c04d6a5..ee0188d33b39 100644 --- a/dpnp/memory/_memory.py +++ b/dpnp/memory/_memory.py @@ -26,11 +26,12 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -import dpctl.tensor as dpt from dpctl.memory import MemoryUSMDevice as DPCTLMemoryUSMDevice from dpctl.memory import MemoryUSMHost as DPCTLMemoryUSMHost from dpctl.memory import MemoryUSMShared as DPCTLMemoryUSMShared +import dpnp.tensor as dpt + def _add_ptr_property(cls): _storage_attr = "_ptr" @@ -76,7 +77,7 @@ def create_data(x): Parameters ---------- x : usm_ndarray - Input array of :class:`dpctl.tensor.usm_ndarray` type. + Input array of :class:`dpnp.tensor.usm_ndarray` type. Returns ------- diff --git a/dpnp/random/dpnp_iface_random.py b/dpnp/random/dpnp_iface_random.py index 31a82fa5ac7b..3cafe12b1958 100644 --- a/dpnp/random/dpnp_iface_random.py +++ b/dpnp/random/dpnp_iface_random.py @@ -839,7 +839,7 @@ def normal( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1100,7 +1100,7 @@ def rand(*args, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1161,7 +1161,7 @@ def randint( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1222,7 +1222,7 @@ def randn(d0, *dn, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1277,7 +1277,7 @@ def random(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
@@ -1328,7 +1328,7 @@ def random_integers( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1396,7 +1396,7 @@ def random_sample(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1446,7 +1446,7 @@ def ranf(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1537,7 +1537,7 @@ def sample(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1616,7 +1616,7 @@ def seed(seed=None, device=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1777,7 +1777,7 @@ def standard_normal(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1922,7 +1922,7 @@ def uniform( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
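# Hedged usage sketch for the `device` and `usm_type` keywords documented in
# the random-API hunks above; call signatures follow the docstrings shown,
# not a verified installation. Assumes a default SYCL device.
import dpnp

x = dpnp.random.normal(loc=0.0, scale=1.0, size=4, usm_type="device")
y = dpnp.random.rand(4, device=x.device)  # place on the same device
assert y.usm_type == "device"             # "device" is the documented default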
diff --git a/dpnp/random/dpnp_random_state.py b/dpnp/random/dpnp_random_state.py index e49fe739aedd..9456169ec114 100644 --- a/dpnp/random/dpnp_random_state.py +++ b/dpnp/random/dpnp_random_state.py @@ -36,7 +36,6 @@ """ -import dpctl.utils as dpu import numpy import dpnp @@ -46,6 +45,7 @@ use_origin_backend, ) from dpnp.random.dpnp_algo_random import MCG59, MT19937 +from dpnp.tensor import validate_usm_type class RandomState: @@ -65,7 +65,7 @@ class RandomState: `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -269,7 +269,7 @@ def normal( f"scale={scale}, but must be non-negative." ) - dpu.validate_usm_type(usm_type, allow_none=False) + validate_usm_type(usm_type, allow_none=False) return self._random_state.normal( loc=loc, scale=scale, @@ -635,7 +635,7 @@ def uniform( dtype = self._validate_float_dtype( dtype, (dpnp.int32, dpnp.float32, dpnp.float64) ) - dpu.validate_usm_type(usm_type, allow_none=False) + validate_usm_type(usm_type, allow_none=False) return self._random_state.uniform( low=low, diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index d083f1c2c0a2..4fe2b9fb32a3 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -43,11 +43,13 @@ from warnings import warn -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li + +# pylint: disable=no-name-in-module +import dpnp.tensor._tensor_impl as ti from dpnp.dpnp_utils import get_usm_allocations from dpnp.linalg.dpnp_utils_linalg import _common_type, _real_type diff --git a/dpnp/tensor/CMakeLists.txt b/dpnp/tensor/CMakeLists.txt new file mode 100644 index 000000000000..d0fe57cade64 --- /dev/null +++ b/dpnp/tensor/CMakeLists.txt @@ -0,0 +1,394 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+find_package(Python COMPONENTS Development.Module)
+
+# Tensor-specific flags
+
+# dpctl doesn't add -fsycl globally,
+# only to pybind11 module sources via add_sycl_to_target()
+string(REPLACE "-fsycl " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+# Use LLD linker (dpctl sets this at root level)
+if(UNIX)
+    add_link_options("-fuse-ld=lld")
+endif()
+
+# Remove global coverage flags for tensor;
+# use a link-time-only approach like dpctl
+if(DPNP_GENERATE_COVERAGE)
+    string(REPLACE "-fprofile-instr-generate " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    string(REPLACE "-fcoverage-mapping " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    string(REPLACE "-fno-sycl-use-footer " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+endif()
+
+# Tensor-specific debug flags:
+# disable device-code debug info for Debug and Coverage builds to speed up linking
+if(
+    CMAKE_BUILD_TYPE STREQUAL "Debug"
+    OR CMAKE_BUILD_TYPE STREQUAL "DEBUG"
+    OR CMAKE_BUILD_TYPE STREQUAL "Coverage"
+)
+    if(WIN32)
+        add_compile_options(-Xsycl-target-frontend=spir64 "-g0")
+    elseif(UNIX)
+        add_compile_options(-Xsycl-target-frontend=spir64 "-g0")
+        if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+            string(REPLACE "-g1" "-g" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
+            string(REPLACE "-g1" "-g" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
+        endif()
+    endif()
+endif()
+
+# Match dpctl warning flags:
+# suppress unused-parameter warnings
+add_compile_options(-Wno-unused-parameter)
+
+file(GLOB _cython_sources *.pyx)
+foreach(_cy_file ${_cython_sources})
+    get_filename_component(_trgt ${_cy_file} NAME_WLE)
+    build_dpnp_tensor_ext(${_trgt} ${_cy_file} "dpnp/tensor" RELATIVE_PATH "..")
+    target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+endforeach()
+
+if(WIN32)
+    if(${CMAKE_VERSION} VERSION_LESS "3.27")
+        # this is a work-around for target_link_options inserting the option
+        # after the -link option, causing the linker to ignore it.
+ set(CMAKE_CXX_LINK_FLAGS + "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel" + ) + endif() +endif() + +# TODO: reuse this library for dpnp ufunc extension build +set(_static_lib_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) +set(_accumulator_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp +) +set(_elementwise_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/angle.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/reciprocal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp +) +set(_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp +) +set(_sorting_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp +) +set(_linalg_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linalg_functions/dot.cpp +) +set(_tensor_accumulation_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp + ${_accumulator_sources} +) +set(_tensor_elementwise_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp + ${_elementwise_sources} +) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${_reduction_sources} +) +set(_tensor_sorting_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp + ${_sorting_sources} +) +set(_tensor_linalg_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_linalg.cpp + ${_linalg_sources} +) + +set(_static_lib_trgt simplify_iteration_space) + +add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) +target_include_directories( + ${_static_lib_trgt} + PRIVATE + # ${Python_INCLUDE_DIRS} + # ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include +) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Module) +set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +set(_py_trgts) + +set(python_module_name _tensor_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) 
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_accumulation_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_accumulation_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_elementwise_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_reductions_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_sorting_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_linalg_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_linalg_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_linalg_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(_clang_prefix "") +if(WIN32) + set(_clang_prefix "/clang:") +endif() + +set(_no_fast_math_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp +) +list( + APPEND _no_fast_math_sources + ${_elementwise_sources} + ${_reduction_sources} + ${_sorting_sources} + ${_linalg_sources} + ${_accumulator_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() + +set(_compiler_definitions "") + +foreach(_src_fn ${_elementwise_sources}) + get_source_file_property(_cmpl_options_defs ${_src_fn} COMPILE_DEFINITIONS) + if(${_cmpl_options_defs}) + set(_combined_options_defs ${_cmpl_options_defs} "${_compiler_definitions}") + else() + set(_combined_options_defs "${_compiler_definitions}") + endif() + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_DEFINITIONS "${_combined_options_defs}" + ) +endforeach() + +set(_linker_options "LINKER:${DPNP_LDFLAGS}") +foreach(python_module_name ${_py_trgts}) + target_compile_options( + ${python_module_name} + PRIVATE -fno-sycl-id-queries-fit-in-int + ) + target_link_options( + ${python_module_name} + PRIVATE 
-fsycl-device-code-split=per_kernel + ) + if(DPNP_TENSOR_OFFLOAD_COMPRESS) + target_link_options(${python_module_name} PRIVATE --offload-compress) + endif() + + target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ${CMAKE_BINARY_DIR} # For generated Cython headers + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + if(DPNP_GENERATE_COVERAGE) + if(DPNP_TENSOR_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS) + target_compile_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + target_link_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + if(_dpnp_sycl_targets) + # make fat binary + target_compile_options( + ${python_module_name} + PRIVATE ${_dpnp_sycl_target_compile_options} + ) + target_link_options( + ${python_module_name} + PRIVATE ${_dpnp_sycl_target_link_options} + ) + endif() + # Ensure Cython modules build first so _usmarray.h exists + add_dependencies(${python_module_name} _usmarray) + if(DPNP_WITH_REDIST) + set_target_properties( + ${python_module_name} + PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." + ) + endif() + install(TARGETS ${python_module_name} DESTINATION "dpnp/tensor") +endforeach() diff --git a/dpnp/tensor/__init__.pxd b/dpnp/tensor/__init__.pxd new file mode 100644 index 000000000000..a4bcecfec1d1 --- /dev/null +++ b/dpnp/tensor/__init__.pxd @@ -0,0 +1,36 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" This file declares the extension types and functions for the Cython API + implemented in _usmarray.pyx file. 
+""" + +# distutils: language = c++ +# cython: language_level=3 + +from ._usmarray cimport * diff --git a/dpnp/tensor/__init__.py b/dpnp/tensor/__init__.py new file mode 100644 index 000000000000..0118e04f7ab1 --- /dev/null +++ b/dpnp/tensor/__init__.py @@ -0,0 +1,425 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum +from ._array_api import __array_api_version__, __array_namespace_info__ +from ._clip import clip +from ._compute_follows_data import ( + ExecutionPlacementError, + get_coerced_usm_type, + get_execution_queue, + validate_usm_type, +) +from ._constants import e, inf, nan, newaxis, pi +from ._copy_utils import ( + asnumpy, + astype, + copy, + from_numpy, + to_numpy, +) +from ._ctors import ( + arange, + asarray, + empty, + empty_like, + eye, + full, + full_like, + linspace, + meshgrid, + ones, + ones_like, + tril, + triu, + zeros, + zeros_like, +) +from ._data_types import ( + bool, + complex64, + complex128, + dtype, + float16, + float32, + float64, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, +) +from ._device import Device +from ._dldevice_conversions import ( + dldevice_to_sycl_device, + sycl_device_to_dldevice, +) +from ._dlpack import from_dlpack +from ._elementwise_funcs import ( + abs, + acos, + acosh, + add, + angle, + asin, + asinh, + atan, + atan2, + atanh, + bitwise_and, + bitwise_invert, + bitwise_left_shift, + bitwise_or, + bitwise_right_shift, + bitwise_xor, + cbrt, + ceil, + conj, + copysign, + cos, + cosh, + divide, + equal, + exp, + exp2, + expm1, + floor, + floor_divide, + greater, + greater_equal, + hypot, + imag, + isfinite, + isinf, + isnan, + less, + less_equal, + log, + log1p, + log2, + log10, + logaddexp, + logical_and, + logical_not, + logical_or, + logical_xor, + maximum, + minimum, + multiply, + negative, + nextafter, + not_equal, + positive, + pow, + proj, + real, + reciprocal, + remainder, + round, + rsqrt, + sign, + signbit, + sin, + sinh, + sqrt, + square, + subtract, + tan, + tanh, + trunc, +) +from ._indexing_functions import ( + extract, + nonzero, + place, + put, + put_along_axis, + take, + take_along_axis, +) +from ._linear_algebra_functions import ( + matmul, + matrix_transpose, + tensordot, + vecdot, +) +from ._manipulation_functions import ( + broadcast_arrays, + broadcast_to, + concat, + expand_dims, + flip, + moveaxis, + permute_dims, + repeat, + roll, + squeeze, + stack, + swapaxes, + tile, + unstack, +) +from ._print import ( + get_print_options, + print_options, + set_print_options, + usm_ndarray_repr, + usm_ndarray_str, +) +from ._reduction import ( + argmax, + argmin, + count_nonzero, + logsumexp, + max, + min, + prod, + reduce_hypot, + sum, +) + +# isort: off +# placed here to avoid circular import +from ._usmarray import DLDeviceType, usm_ndarray + +# isort: on +from ._reshape import reshape +from ._search_functions import where +from ._searchsorted import searchsorted +from ._set_functions import ( + isin, + unique_all, + unique_counts, + unique_inverse, + unique_values, +) +from ._sorting import argsort, sort, top_k +from ._statistical_functions import mean, std, var +from ._testing import allclose +from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type +from ._utility_functions import all, any, diff + +__all__ = [ + "Device", + "DLDeviceType", + "usm_ndarray", + # data types + "bool", + "dtype", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float16", + "float32", + "float64", + "complex64", + "complex128", + # constants + "e", + "inf", + "nan", + "newaxis", + "pi", + # functions + "abs", + "acos", + "acosh", + "add", + "all", + "allclose", + "angle", + "any", + "arange", + "argmax", + 
"argmin", + "argsort", + "asarray", + "asin", + "asinh", + "asnumpy", + "astype", + "atan", + "atanh", + "atan2", + "bitwise_and", + "bitwise_invert", + "bitwise_left_shift", + "bitwise_or", + "bitwise_right_shift", + "bitwise_xor", + "broadcast_arrays", + "broadcast_to", + "can_cast", + "cbrt", + "ceil", + "concat", + "conj", + "copy", + "copysign", + "cos", + "cosh", + "count_nonzero", + "clip", + "cumulative_logsumexp", + "cumulative_prod", + "cumulative_sum", + "diff", + "divide", + "dldevice_to_sycl_device", + "empty", + "empty_like", + "equal", + "extract", + "expand_dims", + "eye", + "exp", + "exp2", + "expm1", + "finfo", + "flip", + "floor", + "floor_divide", + "from_dlpack", + "from_numpy", + "full", + "full_like", + "get_print_options", + "greater", + "greater_equal", + "hypot", + "iinfo", + "imag", + "isfinite", + "isinf", + "isdtype", + "isin", + "isnan", + "less", + "less_equal", + "linspace", + "log", + "logaddexp", + "logical_and", + "logical_not", + "logical_or", + "logical_xor", + "logsumexp", + "log1p", + "log2", + "log10", + "max", + "maximum", + "mean", + "meshgrid", + "min", + "minimum", + "moveaxis", + "multiply", + "permute_dims", + "matmul", + "matrix_transpose", + "negative", + "nextafter", + "nonzero", + "not_equal", + "ones", + "ones_like", + "place", + "positive", + "pow", + "print_options", + "prod", + "proj", + "put", + "put_along_axis", + "real", + "reciprocal", + "reduce_hypot", + "remainder", + "repeat", + "reshape", + "result_type", + "roll", + "round", + "rsqrt", + "searchsorted", + "set_print_options", + "sign", + "signbit", + "sin", + "sinh", + "sort", + "sqrt", + "square", + "squeeze", + "stack", + "std", + "subtract", + "sum", + "swapaxes", + "sycl_device_to_dldevice", + "take", + "take_along_axis", + "tan", + "tanh", + "tensordot", + "tile", + "top_k", + "to_numpy", + "tril", + "triu", + "trunc", + "unique_all", + "unique_counts", + "unique_inverse", + "unique_values", + "unstack", + "usm_ndarray_repr", + "usm_ndarray_str", + "var", + "vecdot", + "where", + "zeros", + "zeros_like", + "__array_api_version__", + "__array_namespace_info__", + # utilities + "ExecutionPlacementError", + "get_coerced_usm_type", + "get_execution_queue", + "validate_usm_type", +] diff --git a/dpnp/tensor/_accumulation.py b/dpnp/tensor/_accumulation.py new file mode 100644 index 000000000000..069eb870f783 --- /dev/null +++ b/dpnp/tensor/_accumulation.py @@ -0,0 +1,466 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_accumulation_impl as tai +import dpnp.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index +from ._type_utils import ( + _default_accumulation_dtype, + _default_accumulation_dtype_fp_types, + _to_device_supported_dtype, +) + + +def _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + _accumulate_fn, + _accumulate_include_initial_fn, + _dtype_supported, + _default_accumulation_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + appended_axis = False + if x.ndim == 0: + x = x[dpt.newaxis] + appended_axis = True + nd = x.ndim + if axis is None: + if nd > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format(nd) + ) + axis = 0 + else: + axis = normalize_axis_index(axis, nd, "axis") + sh = x.shape + res_sh = ( + sh[:axis] + (sh[axis] + 1,) + sh[axis + 1 :] if include_initial else sh + ) + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + q = x.sycl_queue + inp_dt = x.dtype + res_usm_type = x.usm_type + if dtype is None: + res_dt = _default_accumulation_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + # checking now avoids unnecessary allocations + implemented_types = _dtype_supported(inp_dt, res_dt) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined accumulation data type does not " + "have direct implementation" + ) + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + out_sh = out.shape + # append an axis to `out` if scalar + if appended_axis and not include_initial: + out = out[dpt.newaxis, ...] + orig_out = out + final_res_sh = res_sh[1:] + else: + final_res_sh = res_sh + if not out_sh == final_res_sh: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_sh}, got {out_sh}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " f"got {out.dtype}" + ) + if dpt.get_execution_queue((q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + # permute out array dims if necessary + if a1 != nd: + out = dpt.permute_dims(out, perm) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + out = dpt.permute_dims(out, perm) + + _manager = SequentialOrderManager[q] + depends = _manager.submitted_events + if implemented_types: + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=arr, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=depends, + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=arr, dst=out, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e, acc_ev) + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + out = orig_out + else: + if _dtype_supported(res_dt, res_dt): + tmp = dpt.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e, acc_ev) + else: + buf_dt = _default_accumulation_type_fn(inp_dt, q) + tmp = dpt.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + tmp_res = dpt.empty( + res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + tmp_res = dpt.permute_dims(tmp_res, perm) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e, acc_ev) + ht_e_cpy2, cpy_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[acc_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy_e2) + + if appended_axis: + out = dpt.squeeze(out) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(out, inv_perm) + + return out + + +def cumulative_sum( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_sum(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative sum of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which cumulative sum must be computed. + If `None`, the sum is computed over the entire array. 
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If `x` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              `x`.
+            * If `x` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array `x` is allocated.
+            * If `x` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array `x` is allocated.
+            * If `x` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative sum.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            additive identity, zero) as the first value along the provided axis
+            in the output. Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The shape of `out` must match the expected shape of the result,
+            and its data type must match the expected data type of the result
+            or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative sums. The returned array has the
+            data type as described in the `dtype` parameter description above.
+
+            The returned array shape is determined as follows:
+
+            * If `include_initial` is `False`, the returned array will
+              have the same shape as `x`
+            * If `include_initial` is `True`, the returned array will
+              have the same shape as `x` except the axis along which the
+              cumulative sum is calculated, which will have size `N+1`
+
+            where `N` is the size of the axis the cumulative sums are computed
+            along.
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumsum_over_axis,
+        tai._cumsum_final_axis_include_initial,
+        tai._cumsum_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def cumulative_prod(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False,
+        out=None)
+
+    Calculates the cumulative product of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative product must be computed.
+            If `None`, the product is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If `x` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              `x`.
+            * If `x` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array `x` is allocated.
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumsum_over_axis,
+        tai._cumsum_final_axis_include_initial,
+        tai._cumsum_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def cumulative_prod(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False,
+        out=None)
+
+    Calculates the cumulative product of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative product must be computed.
+            If `None`, the product is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If `x` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              `x`.
+            * If `x` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array `x` is allocated.
+            * If `x` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array `x` is allocated.
+            * If `x` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative product.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            multiplicative identity, one) as the first value along the provided
+            axis in the output. Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            `out` must have the expected shape, and its data type must match
+            the expected data type of the result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative products. The returned array has
+            the data type as described in the `dtype` parameter description
+            above.
+
+            The returned array shape is determined as follows:
+
+            * If `include_initial` is `False`, the returned array will
+              have the same shape as `x`
+            * If `include_initial` is `True`, the returned array will
+              have the same shape as `x` except the axis along which the
+              cumulative product is calculated, which will have size `N+1`
+
+            where `N` is the size of the axis the cumulative products are
+            computed along.
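+
+    Example:
+        A minimal sketch, assuming the function is re-exported from the
+        `dpnp.tensor` namespace and a default-selected device is
+        available:
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.asarray([1, 2, 3, 4])
+        >>> dpt.cumulative_prod(x)
+        usm_ndarray([ 1,  2,  6, 24])
+        >>> dpt.cumulative_prod(x, include_initial=True)
+        usm_ndarray([ 1,  1,  2,  6, 24])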
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumprod_over_axis,
+        tai._cumprod_final_axis_include_initial,
+        tai._cumprod_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def cumulative_logsumexp(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_logsumexp(x, /, *, axis=None, dtype=None, include_initial=False,
+        out=None)
+
+    Calculates the cumulative logsumexp of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative logsumexp must be computed.
+            If `None`, the logsumexp is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If `x` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              `x`.
+            * If `x` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array `x` is allocated.
+            * If `x` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array `x` is allocated.
+            * If `x` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative logsumexp.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            identity of the logsumexp operation, negative infinity) as the
+            first value along the provided axis in the output.
+            Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            `out` must have the expected shape, and its data type must match
+            the expected data type of the result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative logsumexp results. The returned
+            array has the data type as described in the `dtype` parameter
+            description above.
+
+            The returned array shape is determined as follows:
+
+            * If `include_initial` is `False`, the returned array will
+              have the same shape as `x`
+            * If `include_initial` is `True`, the returned array will
+              have the same shape as `x` except the axis along which the
+              cumulative logsumexp is calculated, which will have size
+              `N+1`
+
+            where `N` is the size of the axis the cumulative logsumexp
+            values are computed along.
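+
+    Example:
+        A minimal sketch, assuming the function is re-exported from the
+        `dpnp.tensor` namespace (results shown to limited precision):
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.zeros(3)
+        >>> dpt.cumulative_logsumexp(x)  # log(1), log(2), log(3)
+        usm_ndarray([0.        , 0.69314718, 1.09861229])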
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumlogsumexp_over_axis,
+        tai._cumlogsumexp_final_axis_include_initial,
+        tai._cumlogsumexp_dtype_supported,
+        _default_accumulation_dtype_fp_types,
+    )
diff --git a/dpnp/tensor/_array_api.py b/dpnp/tensor/_array_api.py
new file mode 100644
index 000000000000..a18bc2be1824
--- /dev/null
+++ b/dpnp/tensor/_array_api.py
@@ -0,0 +1,254 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+
+import dpnp.tensor as dpt
+
+from ._tensor_impl import (
+    default_device_complex_type,
+    default_device_fp_type,
+    default_device_index_type,
+    default_device_int_type,
+)
+
+
+def _isdtype_impl(dtype, kind):
+    if isinstance(kind, str):
+        if kind == "bool":
+            return dtype.kind == "b"
+        elif kind == "signed integer":
+            return dtype.kind == "i"
+        elif kind == "unsigned integer":
+            return dtype.kind == "u"
+        elif kind == "integral":
+            return dtype.kind in "iu"
+        elif kind == "real floating":
+            return dtype.kind == "f"
+        elif kind == "complex floating":
+            return dtype.kind == "c"
+        elif kind == "numeric":
+            return dtype.kind in "iufc"
+        else:
+            raise ValueError(f"Unrecognized data type kind: {kind}")
+
+    elif isinstance(kind, tuple):
+        return any(_isdtype_impl(dtype, k) for k in kind)
+    else:
+        raise TypeError(f"Unsupported type for dtype kind: {type(kind)}")
+
+
+def _get_device_impl(d):
+    if d is None:
+        return dpctl.select_default_device()
+    elif isinstance(d, dpctl.SyclDevice):
+        return d
+    elif isinstance(d, (dpt.Device, dpctl.SyclQueue)):
+        return d.sycl_device
+    else:
+        try:
+            return dpctl.SyclDevice(d)
+        except TypeError:
+            raise TypeError(f"Unsupported type for device argument: {type(d)}")
+
+
+__array_api_version__ = "2024.12"
+
+
+class Info:
+    """namespace returned by ``__array_namespace_info__()``"""
+
+    def __init__(self):
+        self._capabilities = {
+            "boolean indexing": True,
+            "data-dependent shapes": True,
+            "max dimensions": None,
+        }
+        self._all_dtypes = {
+            "bool": dpt.bool,
+            "float32": dpt.float32,
+            "float64": dpt.float64,
+            "complex64": dpt.complex64,
+            "complex128": dpt.complex128,
+            "int8": dpt.int8,
+            "int16": dpt.int16,
+            "int32": dpt.int32,
+            "int64": dpt.int64,
+            "uint8": dpt.uint8,
+            "uint16": dpt.uint16,
+            "uint32": dpt.uint32,
+            "uint64": dpt.uint64,
+        }
+
+    def capabilities(self):
+        """
+        capabilities()
+
+        Returns a dictionary of ``dpctl``'s capabilities.
+
+        The dictionary contains the following keys:
+
+        ``"boolean indexing"``:
+            boolean indicating ``dpctl``'s support of boolean indexing.
+            Value: ``True``
+        ``"data-dependent shapes"``:
+            boolean indicating ``dpctl``'s support of data-dependent shapes.
+            Value: ``True``
+        ``"max dimensions"``:
+            integer indicating the maximum array dimension supported by
+            ``dpctl``. Value: ``None``
+
+        Returns:
+            dict:
+                dictionary of ``dpctl``'s capabilities
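+
+        Example:
+            A quick look at the returned dictionary (assumes the
+            inspection namespace is re-exported as
+            ``dpnp.tensor.__array_namespace_info__``):
+
+            >>> import dpnp.tensor as dpt
+            >>> dpt.__array_namespace_info__().capabilities()
+            {'boolean indexing': True, 'data-dependent shapes': True, 'max dimensions': None}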
+        """
+        return self._capabilities.copy()
+
+    def default_device(self):
+        """
+        default_device()
+
+        Returns the default SYCL device.
+        """
+        return dpctl.select_default_device()
+
+    def default_dtypes(self, *, device=None):
+        """
+        default_dtypes(*, device=None)
+
+        Returns a dictionary of default data types for ``device``.
+
+        Args:
+            device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
+                array API concept of device used in getting default data types.
+                ``device`` can be ``None`` (in which case the default device
+                is used), an instance of :class:`dpctl.SyclDevice`, an instance
+                of :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
+                a filter selector string.
+                Default: ``None``.
+
+        Returns:
+            dict:
+                a dictionary of default data types for ``device``:
+
+                - ``"real floating"``: dtype
+                - ``"complex floating"``: dtype
+                - ``"integral"``: dtype
+                - ``"indexing"``: dtype
+        """
+        device = _get_device_impl(device)
+        return {
+            "real floating": dpt.dtype(default_device_fp_type(device)),
+            "complex floating": dpt.dtype(default_device_complex_type(device)),
+            "integral": dpt.dtype(default_device_int_type(device)),
+            "indexing": dpt.dtype(default_device_index_type(device)),
+        }
+
+    def dtypes(self, *, device=None, kind=None):
+        """
+        dtypes(*, device=None, kind=None)
+
+        Returns a dictionary of all Array API data types of a specified
+        ``kind`` supported by ``device``.
+
+        This dictionary only includes data types supported by the
+        `Python Array API <https://data-apis.org/array-api/latest/>`_
+        specification.
+
+        Args:
+            device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
+                array API concept of device used in getting default data types.
+                ``device`` can be ``None`` (in which case the default device is
+                used), an instance of :class:`dpctl.SyclDevice`, an instance of
+                :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
+                a filter selector string.
+                Default: ``None``.
+
+            kind (Optional[str, Tuple[str, ...]]):
+                data type kind.
+
+                - if ``kind`` is ``None``, returns a dictionary of all data
+                  types supported by `device`
+                - if ``kind`` is a string, returns a dictionary containing the
+                  data types belonging to the data type kind specified.
+
+                  Supports:
+
+                  * ``"bool"``
+                  * ``"signed integer"``
+                  * ``"unsigned integer"``
+                  * ``"integral"``
+                  * ``"real floating"``
+                  * ``"complex floating"``
+                  * ``"numeric"``
+
+                - if ``kind`` is a tuple, the tuple represents a union of
+                  ``kind`` strings, and returns a dictionary containing data
+                  types corresponding to the specified union.
+
+                Default: ``None``.
+
+        Returns:
+            dict:
+                a dictionary of the supported data types of the specified
+                ``kind``
+        """
+        device = _get_device_impl(device)
+        _fp64 = device.has_aspect_fp64
+        if kind is None:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if _fp64 or (key != "float64" and key != "complex128")
+            }
+        else:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if (_fp64 or (key != "float64" and key != "complex128"))
+                and _isdtype_impl(val, kind)
+            }
+
+    def devices(self):
+        """
+        devices()
+
+        Returns a list of supported devices.
+        """
+        return dpctl.get_devices()
+
+
+def __array_namespace_info__():
+    """
+    __array_namespace_info__()
+
+    Returns a namespace with Array API namespace inspection utilities.
+
+    """
+    return Info()
diff --git a/dpnp/tensor/_clip.py b/dpnp/tensor/_clip.py
new file mode 100644
index 000000000000..44434fc0bb0c
--- /dev/null
+++ b/dpnp/tensor/_clip.py
@@ -0,0 +1,771 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as tei +import dpnp.tensor._tensor_impl as ti + +from ._copy_utils import ( + _empty_like_orderK, + _empty_like_pair_orderK, + _empty_like_triple_orderK, +) +from ._manipulation_functions import _broadcast_shape_impl +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + _can_cast, + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) + + +def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev): + """ + Checks if both types `arg1_dtype` and `arg2_dtype` can be + cast to `res_dtype` according to the rule `safe` + """ + if arg1_dtype == res_dtype and arg2_dtype == res_dtype: + return None, None, res_dtype + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast( + arg2_dtype, res_dtype, _fp16, _fp64 + ): + # prevent unnecessary casting + ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype + ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype + return ret_buf1_dt, ret_buf2_dt, res_dtype + else: + return None, None, None + + +def _clip_none(x, val, out, order, _binary_fn): + q1, x_usm_type = x.sycl_queue, x.usm_type + q2, val_usm_type = _get_queue_usm_type(val) + if q2 is None: + exec_q = q1 + res_usm_type = x_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + val_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + x_shape = x.shape + val_shape = _get_shape(val) + if not isinstance(val_shape, (tuple, list)): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + val_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape} and {val_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + val_dtype = _get_dtype(val, sycl_dev) + if not _validate_dtype(val_dtype): + raise ValueError("Operands have unsupported data types") + + val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev) + + res_dt = x.dtype + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if not _can_cast(val_dtype, res_dt, _fp16, _fp64): + raise ValueError( + f"function 'clip' does not support input types " + f"({x_dtype}, {val_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, copy_ev) + out = orig_out + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, 
copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + +def clip(x, /, min=None, max=None, out=None, order="K"): + """clip(x, min=None, max=None, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. + min ({None, Union[usm_ndarray, bool, int, float, complex]}, optional): + Array containing minimum values. + Must be compatible with `x` and `max` according + to broadcasting rules. + max ({None, Union[usm_ndarray, bool, int, float, complex]}, optional): + Array containing maximum values. + Must be compatible with `x` and `min` according + to broadcasting rules. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is + `None`. + Default: "K". + + Returns: + usm_ndarray: + An array with elements clipped to the range [`min`, `max`]. + The returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected `x` to be of dpnp.tensor.usm_ndarray type, got " + f"{type(x)}" + ) + if order not in ["K", "C", "F", "A"]: + order = "K" + if x.dtype.kind in "iu": + if isinstance(min, int) and min <= dpt.iinfo(x.dtype).min: + min = None + if isinstance(max, int) and max >= dpt.iinfo(x.dtype).max: + max = None + if min is None and max is None: + exec_q = x.sycl_queue + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != x.shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. 
Expected output shape is {x.shape}, "
+                    f"got {out.shape}"
+                )
+
+            if x.dtype != out.dtype:
+                raise ValueError(
+                    f"Output array of type {x.dtype} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+                raise dpt.ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+            if ti._array_overlap(x, out):
+                if not ti._same_logical_tensors(x, out):
+                    out = dpt.empty_like(out)
+                else:
+                    return out
+        else:
+            if order == "K":
+                out = _empty_like_orderK(x, x.dtype)
+            else:
+                out = dpt.empty_like(x, order=order)
+
+        _manager = SequentialOrderManager[exec_q]
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=out, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if not (orig_out is None or orig_out is out):
+            # Copy the out data from temporary buffer to original memory
+            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out,
+                dst=orig_out,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+            out = orig_out
+        return out
+    elif max is None:
+        return _clip_none(x, min, out, order, tei._maximum)
+    elif min is None:
+        return _clip_none(x, max, out, order, tei._minimum)
+    else:
+        q1, x_usm_type = x.sycl_queue, x.usm_type
+        q2, min_usm_type = _get_queue_usm_type(min)
+        q3, max_usm_type = _get_queue_usm_type(max)
+        if q2 is None and q3 is None:
+            exec_q = q1
+            res_usm_type = x_usm_type
+        elif q3 is None:
+            exec_q = dpt.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise dpt.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpt.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                )
+            )
+        elif q2 is None:
+            exec_q = dpt.get_execution_queue((q1, q3))
+            if exec_q is None:
+                raise dpt.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpt.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    max_usm_type,
+                )
+            )
+        else:
+            exec_q = dpt.get_execution_queue((q1, q2, q3))
+            if exec_q is None:
+                raise dpt.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpt.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                    max_usm_type,
+                )
+            )
+        dpt.validate_usm_type(res_usm_type, allow_none=False)
+        x_shape = x.shape
+        min_shape = _get_shape(min)
+        max_shape = _get_shape(max)
+        if not all(
+            isinstance(s, (tuple, list))
+            for s in (
+                min_shape,
+                max_shape,
+            )
+        ):
+            raise TypeError(
+                "Shape of arguments can not be inferred. "
+                "Arguments are expected to be "
+                "lists, tuples, or both"
+            )
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + min_shape, + max_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape}, {min_shape}, and {max_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + min_dtype = _get_dtype(min, sycl_dev) + max_dtype = _get_dtype(max, sycl_dev) + if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)): + raise ValueError("Operands have unsupported data types") + + min_dtype, max_dtype = _resolve_one_strong_two_weak_types( + x_dtype, min_dtype, max_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _check_clip_dtypes( + x_dtype, + min_dtype, + max_dtype, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{clip}' does not support input types " + f"({x_dtype}, {min_dtype}, {max_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + if ( + ti._array_overlap(min, out) + and not ti._same_logical_tensors(min, out) + and buf1_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(max, dpt.usm_ndarray): + if ( + ti._array_overlap(max, out) + and not ti._same_logical_tensors(max, out) + and buf2_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + a_min = min + else: + a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + if isinstance(max, dpt.usm_ndarray): + a_max = max + else: + a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + a_max, + ) + ) + else "C" + ) + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=dep_ev, + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = 
ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + if order == "K": + if ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + elif ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if order == "K": + buf2 = 
_empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, clip_ev = ti._clip( + src=x, + min=buf1, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, clip_ev) + return out diff --git a/dpnp/tensor/_compute_follows_data.pyx b/dpnp/tensor/_compute_follows_data.pyx new file mode 100644 index 000000000000..70e6bdfaeb79 --- /dev/null +++ b/dpnp/tensor/_compute_follows_data.pyx @@ -0,0 +1,191 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +"""Compute-follows-data utilities for execution queue and USM type management. + +This module provides utilities to determine execution placement and USM allocation +types when combining arrays under the compute-follows-data paradigm. +""" + + +import dpctl +from dpctl._sycl_queue cimport SyclQueue + +__all__ = [ + "get_execution_queue", "get_coerced_usm_type", "ExecutionPlacementError" +] + + +class ExecutionPlacementError(Exception): + """Exception raised when execution placement target can not + be unambiguously determined from input arrays. 
+
+    Make sure that input arrays are associated with the same
+    :class:`dpctl.SyclQueue`,
+    or migrate data to the same :class:`dpctl.SyclQueue` using
+    :meth:`dpctl.tensor.usm_ndarray.to_device` method.
+    """
+    pass
+
+
+cdef bint queue_equiv(SyclQueue q1, SyclQueue q2):
+    """Queues are equivalent if ``q1 == q2``, that is they are copies
+    of the same underlying SYCL object and hence are the same."""
+    return q1.__eq__(q2)
+
+
+def get_execution_queue(qs, /):
+    """
+    Get execution queue from queues associated with input arrays.
+
+    Args:
+        qs (List[:class:`dpctl.SyclQueue`], Tuple[:class:`dpctl.SyclQueue`]):
+            a list or a tuple of :class:`dpctl.SyclQueue` objects
+            corresponding to arrays that are being combined.
+
+    Returns:
+        SyclQueue:
+            execution queue under compute follows data paradigm,
+            or ``None`` if queues are not equal.
+    """
+    if not isinstance(qs, (list, tuple)):
+        raise TypeError(
+            "Expected a list or a tuple, got {}".format(type(qs))
+        )
+    if len(qs) == 0:
+        return None
+    elif len(qs) == 1:
+        return qs[0] if isinstance(qs[0], dpctl.SyclQueue) else None
+    for q1, q2 in zip(qs[:-1], qs[1:]):
+        if not isinstance(q1, dpctl.SyclQueue):
+            return None
+        elif not isinstance(q2, dpctl.SyclQueue):
+            return None
+        elif not queue_equiv(q1, q2):
+            return None
+    return qs[0]
+
+
+def get_coerced_usm_type(usm_types, /):
+    """
+    Get USM type of the output array for a function combining
+    arrays of given USM types using compute-follows-data execution
+    model.
+
+    Args:
+        usm_types (List[str], Tuple[str]):
+            a list or a tuple of strings of ``.usm_type`` attributes
+            for input arrays
+
+    Returns:
+        str:
+            type of USM allocation for the output array(s).
+            ``None`` if any of the input strings are not recognized.
+    """
+    if not isinstance(usm_types, (list, tuple)):
+        raise TypeError(
+            "Expected a list or a tuple, got {}".format(type(usm_types))
+        )
+    if len(usm_types) == 0:
+        return None
+    _k = ["device", "shared", "host"]
+    _m = {k: i for i, k in enumerate(_k)}
+    res = len(_k)
+    for t in usm_types:
+        if not isinstance(t, str):
+            return None
+        if t not in _m:
+            return None
+        res = min(res, _m[t])
+    return _k[res]
+
+
+def _validate_usm_type_allow_none(usm_type):
+    "Validates usm_type argument"
+    if usm_type is not None:
+        if isinstance(usm_type, str):
+            if usm_type not in ["device", "shared", "host"]:
+                raise ValueError(
+                    f"Unrecognized value of usm_type={usm_type}, "
+                    "expected 'device', 'shared', 'host', or None."
+                )
+        else:
+            raise TypeError(
+                f"Expected usm_type to be a str or None, got {type(usm_type)}"
+            )
+
+
+def _validate_usm_type_disallow_none(usm_type):
+    "Validates usm_type argument"
+    if isinstance(usm_type, str):
+        if usm_type not in ["device", "shared", "host"]:
+            raise ValueError(
+                f"Unrecognized value of usm_type={usm_type}, "
+                "expected 'device', 'shared', or 'host'."
+            )
+    else:
+        raise TypeError(
+            f"Expected usm_type to be a str, got {type(usm_type)}"
+        )
+
+
+def validate_usm_type(usm_type, /, *, allow_none=True):
+    """ validate_usm_type(usm_type, allow_none=True)
+
+    Raises an exception if `usm_type` is invalid.
+
+    Args:
+        usm_type:
+            Specification for USM allocation type. Valid specifications
+            are:
+
+            * ``"device"``
+            * ``"shared"``
+            * ``"host"``
+
+            If ``allow_none`` keyword argument is set, a value of
+            ``None`` is also permitted.
+        allow_none (bool, optional):
+            Whether ``usm_type`` value of ``None`` is considered valid.
+            Default: ``True``.
+
+    Raises:
+        ValueError:
+            if ``usm_type`` is not a recognized string.
+ TypeError: + if ``usm_type`` is not a string, and ``usm_type`` is + not ``None`` provided ``allow_none`` is ``True``. + """ + if allow_none: + _validate_usm_type_allow_none(usm_type) + else: + _validate_usm_type_disallow_none(usm_type) diff --git a/dpnp/tensor/_constants.py b/dpnp/tensor/_constants.py new file mode 100644 index 000000000000..4c134bd9d375 --- /dev/null +++ b/dpnp/tensor/_constants.py @@ -0,0 +1,36 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np + +newaxis = None + +pi = np.pi +e = np.e +nan = np.nan +inf = np.inf diff --git a/dpnp/tensor/_copy_utils.py b/dpnp/tensor/_copy_utils.py new file mode 100644 index 000000000000..3978e7345b12 --- /dev/null +++ b/dpnp/tensor/_copy_utils.py @@ -0,0 +1,1160 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import builtins
+import operator
+from numbers import Integral
+
+import dpctl
+import dpctl.memory as dpm
+import numpy as np
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+
+from ._data_types import _get_dtype
+from ._device import normalize_queue_device
+from ._numpy_helper import normalize_axis_index
+from ._type_utils import _dtype_supported_by_device_impl
+
+__doc__ = (
+    "Implementation module for copy- and cast- operations on "
+    ":class:`dpctl.tensor.usm_ndarray`."
+)
+
+int32_t_max = 1 + np.iinfo(np.int32).max
+
+
+def _copy_to_numpy(ary):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(ary)}")
+    if ary.size == 0:
+        # no data needs to be copied for zero sized array
+        return np.ndarray(ary.shape, dtype=ary.dtype)
+    nb = ary.usm_data.nbytes
+    q = ary.sycl_queue
+    hh = dpm.MemoryUSMHost(nb, queue=q)
+    h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype)
+    itsz = ary.itemsize
+    strides_bytes = tuple(si * itsz for si in ary.strides)
+    offset = ary._element_offset * itsz
+    # ensure that content of ary.usm_data is final
+    q.wait()
+    hh.copy_from_device(ary.usm_data)
+    return np.ndarray(
+        ary.shape,
+        dtype=ary.dtype,
+        buffer=h,
+        strides=strides_bytes,
+        offset=offset,
+    )
+
+
+def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None):
+    """Copies numpy array `np_ary` into a new usm_ndarray"""
+    # This may perform a copy to meet stated requirements
+    Xnp = np.require(np_ary, requirements=["A", "E"])
+    alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None)
+    dt = Xnp.dtype
+    if dt.char in "dD" and alloc_q.sycl_device.has_aspect_fp64 is False:
+        Xusm_dtype = (
+            dpt.dtype("float32") if dt.char == "d" else dpt.dtype("complex64")
+        )
+    else:
+        Xusm_dtype = dt
+    Xusm = dpt.empty(
+        Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue
+    )
+    _copy_from_numpy_into(Xusm, Xnp)
+    return Xusm
+
+
+def _copy_from_numpy_into(dst, np_ary):
+    """Copies `np_ary` into `dst` of type :class:`dpctl.tensor.usm_ndarray`"""
+    if not isinstance(np_ary, np.ndarray):
+        raise TypeError(f"Expected numpy.ndarray, got {type(np_ary)}")
+    if not isinstance(dst, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(dst)}")
+    if np_ary.flags["OWNDATA"]:
+        Xnp = np_ary
+    else:
+        # Determine base of input array
+        base = np_ary.base
+        while isinstance(base, np.ndarray):
+            base = base.base
+        if isinstance(base, dpm._memory._Memory):
+            # we must perform a copy, since subsequent
+            # _copy_numpy_ndarray_into_usm_ndarray is implemented using
+            # sycl::buffer, and using USM-pointers with sycl::buffer
+            # results in undefined behavior
+            Xnp = np_ary.copy()
+        else:
+            Xnp = np_ary
+    src_ary = np.broadcast_to(Xnp, dst.shape)
+    copy_q = dst.sycl_queue
+    if copy_q.sycl_device.has_aspect_fp64 is False:
+        src_ary_dt_c = src_ary.dtype.char
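+        # The device lacks fp64 support: downcast double-precision host
+        # data so the copy below targets a type the device supports
+        # (mirrors the allocation choice made in _copy_from_numpy above).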
+        if src_ary_dt_c == "d":
+            src_ary = src_ary.astype(np.float32)
+        elif src_ary_dt_c == "D":
+            src_ary = src_ary.astype(np.complex64)
+    _manager = SequentialOrderManager[copy_q]
+    dep_ev = _manager.submitted_events
+    # synchronizing call
+    ti._copy_numpy_ndarray_into_usm_ndarray(
+        src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev
+    )
+
+
+def _extract_impl(ary, ary_mask, axis=0):
+    """
+    Extract elements of ary by applying mask starting from slot
+    dimension axis
+    """
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
+        )
+    if isinstance(ary_mask, dpt.usm_ndarray):
+        dst_usm_type = dpt.get_coerced_usm_type(
+            (ary.usm_type, ary_mask.usm_type)
+        )
+        exec_q = dpt.get_execution_queue((ary.sycl_queue, ary_mask.sycl_queue))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "arrays have different associated queues. "
+                "Use `y.to_device(x.device)` to migrate."
+            )
+    elif isinstance(ary_mask, np.ndarray):
+        dst_usm_type = ary.usm_type
+        exec_q = ary.sycl_queue
+        ary_mask = dpt.asarray(
+            ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q
+        )
+    else:
+        raise TypeError(
+            "Expecting type dpnp.tensor.usm_ndarray or numpy.ndarray, got "
+            f"{type(ary_mask)}"
+        )
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter p is inconsistent with input array dimensions"
+        )
+    mask_nelems = ary_mask.size
+    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
+    cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
+    exec_q = cumsum.sycl_queue
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    mask_count = ti.mask_positions(
+        ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
+    )
+    dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    dst = dpt.empty(
+        dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device
+    )
+    if dst.size == 0:
+        return dst
+    hev, ev = ti._extract(
+        src=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        dst=dst,
+        sycl_queue=exec_q,
+        depends=dep_evs,
+    )
+    _manager.add_event_pair(hev, ev)
+    return dst
+
+
+def _get_indices_queue_usm_type(inds, queue, usm_type):
+    """
+    Utility for validating that indices are NumPy ndarrays or usm_ndarrays
+    of integral dtype, or Python integers. At least one must be an array.
+
+    The queue and usm type of each array index are combined with the
+    provided `queue` and `usm_type`, and the resulting common execution
+    queue and coerced usm type are returned.
+ """ + queues = [queue] + usm_types = [usm_type] + any_array = False + for ind in inds: + if isinstance(ind, (np.ndarray, dpt.usm_ndarray)): + any_array = True + if ind.dtype.kind not in "ui": + raise IndexError( + "arrays used as indices must be of integer (or boolean) " + "type" + ) + if isinstance(ind, dpt.usm_ndarray): + queues.append(ind.sycl_queue) + usm_types.append(ind.usm_type) + elif not isinstance(ind, Integral): + raise TypeError( + "all elements of `ind` expected to be usm_ndarrays, " + f"NumPy arrays, or integers, found {type(ind)}" + ) + if not any_array: + raise TypeError( + "at least one element of `inds` expected to be an array" + ) + usm_type = dpt.get_coerced_usm_type(usm_types) + q = dpt.get_execution_queue(queues) + return q, usm_type + + +def _nonzero_impl(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}" + ) + exec_q = ary.sycl_queue + usm_type = ary.usm_type + mask_nelems = ary.size + cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 + cumsum = dpt.empty( + mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" + ) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary, cumsum, sycl_queue=exec_q, depends=dep_evs + ) + indexes_dt = ti.default_device_index_type(exec_q.sycl_device) + indexes = dpt.empty( + (ary.ndim, mask_count), + dtype=indexes_dt, + usm_type=usm_type, + sycl_queue=exec_q, + order="C", + ) + hev, nz_ev = ti._nonzero(cumsum, indexes, ary.shape, exec_q) + res = tuple(indexes[i, :] for i in range(ary.ndim)) + _manager.add_event_pair(hev, nz_ev) + return res + + +def _prepare_indices_arrays(inds, q, usm_type): + """ + Utility taking a mix of usm_ndarray and possibly Python int scalar indices, + a queue (assumed to be common to arrays in inds), and a usm type. + + Python scalar integers are promoted to arrays on the provided queue and + with the provided usm type. All arrays are then promoted to a common + integral type (if possible) before being broadcast to a common shape. + """ + # scalar integers -> arrays + inds = tuple( + map( + lambda ind: ( + ind + if isinstance(ind, dpt.usm_ndarray) + else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q) + ), + inds, + ) + ) + + # promote to a common integral type if possible + ind_dt = dpt.result_type(*inds) + if ind_dt.kind not in "ui": + raise ValueError( + "cannot safely promote indices to an integer data type" + ) + inds = tuple( + map( + lambda ind: ( + ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt) + ), + inds, + ) + ) + + # broadcast + inds = dpt.broadcast_arrays(*inds) + + return inds + + +def _place_impl(ary, ary_mask, vals, axis=0): + """ + Extract elements of ary by applying mask starting from slot + dimension axis. + """ + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}" + ) + if isinstance(ary_mask, dpt.usm_ndarray): + exec_q = dpt.get_execution_queue( + ( + ary.sycl_queue, + ary_mask.sycl_queue, + ) + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + ary.usm_type, + ary_mask.usm_type, + ) + ) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "arrays have different associated queues. " + "Use `y.to_device(x.device)` to migrate." 
+    elif isinstance(ary_mask, np.ndarray):
+        exec_q = ary.sycl_queue
+        coerced_usm_type = ary.usm_type
+        ary_mask = dpt.asarray(
+            ary_mask, usm_type=coerced_usm_type, sycl_queue=exec_q
+        )
+    else:
+        raise TypeError(
+            "Expecting type dpnp.tensor.usm_ndarray or numpy.ndarray, got "
+            f"{type(ary_mask)}"
+        )
+    if exec_q is not None:
+        if not isinstance(vals, dpt.usm_ndarray):
+            vals = dpt.asarray(
+                vals,
+                dtype=ary.dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        else:
+            exec_q = dpt.get_execution_queue((exec_q, vals.sycl_queue))
+            coerced_usm_type = dpt.get_coerced_usm_type(
+                (
+                    coerced_usm_type,
+                    vals.usm_type,
+                )
+            )
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "arrays have different associated queues. "
+            "Use `Y.to_device(X.device)` to migrate."
+        )
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter p is inconsistent with input array dimensions"
+        )
+    mask_nelems = ary_mask.size
+    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
+    cumsum = dpt.empty(
+        mask_nelems,
+        dtype=cumsum_dt,
+        usm_type=coerced_usm_type,
+        device=ary_mask.device,
+    )
+    exec_q = cumsum.sycl_queue
+    _manager = SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    mask_count = ti.mask_positions(
+        ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev
+    )
+    expected_vals_shape = (
+        ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    )
+    if vals.dtype == ary.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, ary.dtype)
+    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+    if mask_nelems == 0:
+        return
+    dep_ev = _manager.submitted_events
+    hev, pl_ev = ti._place(
+        dst=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        rhs=rhs,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, pl_ev)
+    return
+
+
+def _put_multi_index(ary, inds, p, vals, mode=0):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
+        )
+    ary_nd = ary.ndim
+    p = normalize_axis_index(operator.index(p), ary_nd)
+    mode = operator.index(mode)
+    if mode not in [0, 1]:
+        raise ValueError(
+            "Invalid value for mode keyword, only 0 or 1 is supported"
+        )
+    if not isinstance(inds, (list, tuple)):
+        inds = (inds,)
+
+    exec_q, coerced_usm_type = _get_indices_queue_usm_type(
+        inds, ary.sycl_queue, ary.usm_type
+    )
+
+    if exec_q is not None:
+        if not isinstance(vals, dpt.usm_ndarray):
+            vals = dpt.asarray(
+                vals,
+                dtype=ary.dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        else:
+            exec_q = dpt.get_execution_queue((exec_q, vals.sycl_queue))
+            coerced_usm_type = dpt.get_coerced_usm_type(
+                (
+                    coerced_usm_type,
+                    vals.usm_type,
+                )
+            )
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "Can not automatically determine where to allocate the "
+            "result or perform the execution. "
+            "Use `usm_ndarray.to_device` method to migrate data to "
+            "be associated with the same queue."
+        )
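+
+    # Normalize the indices: promote Python scalars to arrays, unify the
+    # integral dtype, and broadcast to a common shape.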
+    inds = _prepare_indices_arrays(inds, exec_q, coerced_usm_type)
+
+    ind0 = inds[0]
+    ary_sh = ary.shape
+    p_end = p + len(inds)
+    if 0 in ary_sh[p:p_end] and ind0.size != 0:
+        raise IndexError(
+            "cannot put into non-empty indices along an empty axis"
+        )
+    expected_vals_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
+    if vals.dtype == ary.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, ary.dtype)
+    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+    _manager = SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    hev, put_ev = ti._put(
+        dst=ary,
+        ind=inds,
+        val=rhs,
+        axis_start=p,
+        mode=mode,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, put_ev)
+    return
+
+
+def _take_multi_index(ary, inds, p, mode=0):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
+        )
+    ary_nd = ary.ndim
+    p = normalize_axis_index(operator.index(p), ary_nd)
+    mode = operator.index(mode)
+    if mode not in [0, 1]:
+        raise ValueError(
+            "Invalid value for mode keyword, only 0 or 1 is supported"
+        )
+    if not isinstance(inds, (list, tuple)):
+        inds = (inds,)
+
+    exec_q, res_usm_type = _get_indices_queue_usm_type(
+        inds, ary.sycl_queue, ary.usm_type
+    )
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "Can not automatically determine where to allocate the "
+            "result or perform the execution. "
+            "Use `usm_ndarray.to_device` method to migrate data to "
+            "be associated with the same queue."
+        )
+
+    inds = _prepare_indices_arrays(inds, exec_q, res_usm_type)
+
+    ind0 = inds[0]
+    ary_sh = ary.shape
+    p_end = p + len(inds)
+    if 0 in ary_sh[p:p_end] and ind0.size != 0:
+        raise IndexError("cannot take non-empty indices from an empty axis")
+    res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
+    res = dpt.empty(
+        res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    )
+    _manager = SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    hev, take_ev = ti._take(
+        src=ary,
+        ind=inds,
+        dst=res,
+        axis_start=p,
+        mode=mode,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, take_ev)
+    return res
+
+
+def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None):
+    """
+    from_numpy(arg, device=None, usm_type="device", sycl_queue=None)
+
+    Creates :class:`dpctl.tensor.usm_ndarray` from instance of
+    :class:`numpy.ndarray`.
+
+    Args:
+        arg:
+            Input convertible to :class:`numpy.ndarray`
+        device (object): array API specification of device where the
+            output array is created. Device can be specified by
+            a filter selector string, an instance of
+            :class:`dpctl.SyclDevice`, an instance of
+            :class:`dpctl.SyclQueue`, or an instance of
+            :class:`dpctl.tensor.Device`. If the value is ``None``,
+            returned array is created on the default-selected device.
+            Default: ``None``
+        usm_type (str): The requested USM allocation type for the
+            output array. Recognized values are ``"device"``,
+            ``"shared"``, or ``"host"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            A SYCL queue that determines output array allocation device
+            as well as execution placement of data movement operations.
+            The ``device`` and ``sycl_queue`` arguments
+            are equivalent. Only one of them should be specified. If both
+            are provided, they must be consistent and result in using the
+            same execution queue. Default: ``None``
+
+    The returned array has the same shape, and the same data type kind.
+    If the device does not support the data type of the input array, the
+    closest supported data type of the same kind may be returned, e.g.
+    an input array of type ``float16`` may be upcast to ``float32`` if the
+    target device does not support 16-bit floating point type.
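+
+    Example:
+        A minimal sketch, assuming `from_numpy` is re-exported from the
+        `dpnp.tensor` namespace and a default-selected device is
+        available:
+
+        >>> import numpy as np
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.from_numpy(np.arange(3))
+        >>> x.shape, x.usm_type
+        ((3,), 'device')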
+ If the device does not support the data type of the input array, the
+ closest supported data type of the same kind may be returned, e.g.
+ an input array of type ``float16`` may be upcast to ``float32`` if the
+ target device does not support 16-bit floating point type.
+ """
+ q = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+ return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=q)
+
+
+def to_numpy(usm_ary, /):
+ """
+ to_numpy(usm_ary)
+
+ Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary``
+ into :class:`numpy.ndarray` instance of the same shape and same data type.
+
+ Args:
+ usm_ary (usm_ndarray):
+ Input array
+ Returns:
+ :class:`numpy.ndarray`:
+ An instance of :class:`numpy.ndarray` populated with content of
+ ``usm_ary``
+ """
+ return _copy_to_numpy(usm_ary)
+
+
+def asnumpy(usm_ary):
+ """
+ asnumpy(usm_ary)
+
+ Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary``
+ into :class:`numpy.ndarray` instance of the same shape and same data
+ type.
+
+ Args:
+ usm_ary (usm_ndarray):
+ Input array
+ Returns:
+ :class:`numpy.ndarray`:
+ An instance of :class:`numpy.ndarray` populated with content
+ of ``usm_ary``
+ """
+ return _copy_to_numpy(usm_ary)
+
+
+class Dummy:
+ """Helper class with specified ``__sycl_usm_array_interface__`` attribute"""
+
+ def __init__(self, iface):
+ self.__sycl_usm_array_interface__ = iface
+
+
+def _copy_overlapping(dst, src):
+ """Assumes src and dst have the same shape."""
+ q = normalize_queue_device(sycl_queue=dst.sycl_queue)
+ tmp = dpt.usm_ndarray(
+ src.shape,
+ dtype=src.dtype,
+ buffer="device",
+ order="C",
+ buffer_ctor_kwargs={"queue": q},
+ )
+ _manager = SequentialOrderManager[q]
+ dep_evs = _manager.submitted_events
+ hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=src, dst=tmp, sycl_queue=q, depends=dep_evs
+ )
+ _manager.add_event_pair(hcp1, cp1)
+ hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=tmp, dst=dst, sycl_queue=q, depends=[cp1]
+ )
+ _manager.add_event_pair(hcp2, cp2)
+
+
+def _copy_same_shape(dst, src):
+ """Assumes src and dst have the same shape."""
+ # check that memory regions do not overlap
+ if ti._array_overlap(dst, src):
+ if src._pointer == dst._pointer and (
+ src is dst
+ or (src.strides == dst.strides and src.dtype == dst.dtype)
+ ):
+ return
+ _copy_overlapping(src=src, dst=dst)
+ return
+
+ copy_q = dst.sycl_queue
+ _manager = SequentialOrderManager[copy_q]
+ dep_evs = _manager.submitted_events
+ hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs
+ )
+ _manager.add_event_pair(hev, cpy_ev)
+
+
+if hasattr(np, "broadcast_shapes"):
+
+ def _broadcast_shapes(sh1, sh2):
+ return np.broadcast_shapes(sh1, sh2)
+
+else:
+
+ def _broadcast_shapes(sh1, sh2):
+ # use arrays with zero strides, whose memory footprint
+ # is independent of the number of array elements
+ return np.broadcast(
+ np.empty(sh1, dtype=[]),
+ np.empty(sh2, dtype=[]),
+ ).shape
+
+
+def _broadcast_strides(X_shape, X_strides, res_ndim):
+ """
+ Broadcasts strides to match the given dimensions;
+ returns strides as a tuple.
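# A minimal round-trip sketch for from_numpy()/to_numpy()/asnumpy() above,
# using the public dpctl.tensor API that these vendored functions mirror.
# Assumes a default-selected SYCL device is available.
import numpy as np
import dpctl.tensor as dpt

host = np.arange(6, dtype=np.float32).reshape(2, 3)
dev_arr = dpt.from_numpy(host)     # copy host data into a USM allocation
back = dpt.to_numpy(dev_arr)       # copy device data into a new ndarray
assert np.array_equal(host, back)
assert dpt.asnumpy(dev_arr).dtype == host.dtype  # asnumpy() is an alias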
+ """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _copy_from_usm_ndarray_to_usm_ndarray(dst, src): + if any( + not isinstance(arg, dpt.usm_ndarray) + for arg in ( + dst, + src, + ) + ): + raise TypeError( + "Both types are expected to be dpnp.tensor.usm_ndarray, " + f"got {type(dst)} and {type(src)}." + ) + + if dst.ndim == src.ndim and dst.shape == src.shape: + _copy_same_shape(dst, src) + return + + try: + common_shape = _broadcast_shapes(dst.shape, src.shape) + except ValueError as exc: + raise ValueError("Shapes of two arrays are not compatible") from exc + + if dst.size < src.size and dst.size < np.prod(common_shape): + raise ValueError("Destination is smaller ") + + if len(common_shape) > dst.ndim: + ones_count = len(common_shape) - dst.ndim + for k in range(ones_count): + if common_shape[k] != 1: + raise ValueError + common_shape = common_shape[ones_count:] + + if src.ndim < len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + elif src.ndim == len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + else: + # since broadcasting succeeded, src.ndim is greater because of + # leading sequence of ones, so we trim it + n = len(common_shape) + new_src_strides = _broadcast_strides( + src.shape[-n:], src.strides[-n:], n + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src.usm_data, + strides=new_src_strides, + offset=src._element_offset, + ) + + _copy_same_shape(dst, src_same_shape) + + +def _make_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty array with shape and strides like `x`, with dtype `dt`, + USM type `usm_type`, on device `dev`. + """ + st = list(x.strides) + perm = sorted( + range(x.ndim), + key=lambda d: builtins.abs(st[d]) if x.shape[d] > 1 else 0, + reverse=True, + ) + inv_perm = sorted(range(x.ndim), key=lambda i: perm[i]) + sh = x.shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if min(st) < 0: + st_sorted = [st[i] for i in perm] + sl = tuple( + ( + slice(None, None, -1) + if st_sorted[i] < 0 + else slice(None, None, None) + ) + for i in range(x.ndim) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_orderK(x, dt, usm_type=None, dev=None): + """ + Returns empty array like `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(x)}") + if usm_type is None: + usm_type = x.usm_type + if dev is None: + dev = x.device + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _from_numpy_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty usm_ndarray like NumPy array `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. + """ + if not isinstance(x, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(x)}") + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + nd1 = X1.ndim + nd2 = X2.ndim + if nd1 > nd2 and X1.shape == res_shape: + return _empty_like_orderK(X1, dt, usm_type, dev) + elif nd1 < nd2 and X2.shape == res_shape: + return _empty_like_orderK(X2, dt, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + if fl1["C"] or fl2["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + max_ndim = max(nd1, nd2) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if (st1_sorted[i] < 0 and st2_sorted[i] < 0) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + if not isinstance(X3, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X3)}") + nd1 = X1.ndim + nd2 = X2.ndim + nd3 = X3.ndim + if X1.shape == res_shape and X2.shape == res_shape and len(res_shape) > nd3: + return _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev) + elif ( + X2.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd1 + ): + return _empty_like_pair_orderK(X2, X3, 
dt, res_shape, usm_type, dev) + elif ( + X1.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd2 + ): + return _empty_like_pair_orderK(X1, X3, dt, res_shape, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + fl3 = X3.flags + if fl1["C"] or fl2["C"] or fl3["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"] and fl3["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + st3 = list(X3.strides) + max_ndim = max(nd1, nd2, nd3) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + st3 += [0] * (max_ndim - len(st3)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + sh3 = list(X3.shape) + [0] * (max_ndim - nd3) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + builtins.abs(st3[d]) if sh3[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + st3_sorted = [st3[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if ( + st1_sorted[i] < 0 + and st2_sorted[i] < 0 + and st3_sorted[i] < 0 + ) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def copy(usm_ary, /, *, order="K"): + """copy(ary, order="K") + + Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`. + + Args: + ary (usm_ndarray): + Input array + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + Controls the memory layout of the output array + Returns: + usm_ndarray: + A copy of the input array. + + Memory layout of the copy is controlled by ``order`` keyword, + following NumPy's conventions. The ``order`` keywords can be + one of the following: + + .. list-table:: + + * - ``"C"`` + - C-contiguous memory layout + * - ``"F"`` + - Fortran-contiguous memory layout + * - ``"A"`` + - Fortran-contiguous if the input array is also Fortran-contiguous, + otherwise C-contiguous + * - ``"K"`` + - match the layout of ``usm_ary`` as closely as possible. + + """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + if not isinstance(usm_ary, dpt.usm_ndarray): + raise TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, usm_ary.dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=usm_ary.dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_same_shape(R, usm_ary) + return R + + +def astype( + usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None +): + """astype(array, new_dtype, order="K", casting="unsafe", \ + copy=True, device=None) + + Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a + specified type. + + Args: + array (usm_ndarray): + An input array. + new_dtype (dtype): + The data type of the resulting array. If `None`, gives default + floating point type supported by device where the resulting array + will be located. + order ({"C", "F", "A", "K"}, optional): + Controls memory layout of the resulting array if a copy + is returned. + casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional): + Controls what kind of data casting may occur. Please see + :meth:`numpy.ndarray.astype` for description of casting modes. + copy (bool, optional): + By default, `astype` always returns a newly allocated array. + If this keyword is set to `False`, a view of the input array + may be returned when possible. + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is `None`, + returned array is created on the same device as `array`. + Default: `None`. + + Returns: + usm_ndarray: + An array with requested data type. + + A view can be returned, if possible, when `copy=False` is used. + """ + if not isinstance(usm_ary, dpt.usm_ndarray): + return TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + ary_dtype = usm_ary.dtype + if device is not None: + if not isinstance(device, dpctl.SyclQueue): + if isinstance(device, dpt.Device): + device = device.sycl_queue + else: + device = dpt.Device.create_device(device).sycl_queue + d = device.sycl_device + target_dtype = _get_dtype(newdtype, device) + if not _dtype_supported_by_device_impl( + target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 + ): + raise ValueError( + f"Requested dtype '{target_dtype}' is not supported by the " + "target device" + ) + usm_ary = usm_ary.to_device(device) + else: + target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) + + if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + raise TypeError( + f"Can not cast from {ary_dtype} to {newdtype} " + f"according to rule {casting}." + ) + c_contig = usm_ary.flags.c_contiguous + f_contig = usm_ary.flags.f_contiguous + needs_copy = copy or not ary_dtype == target_dtype + if not needs_copy and (order != "K"): + # ensure that order="F" for C-contig input triggers copy, + # and order="C" for F-contig input triggers copy too. 
+ # 1D arrays which are both C- and F- contig should not + # force copying for neither order="F", nor order="C", see gh-1926 + needs_copy = ( + c_contig and not f_contig and order not in ["A", "C"] + ) or (not c_contig and f_contig and order not in ["A", "F"]) + if not needs_copy: + return usm_ary + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. " + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, target_dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=target_dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) + return R diff --git a/dpnp/tensor/_ctors.py b/dpnp/tensor/_ctors.py new file mode 100644 index 000000000000..b6e28afdc9e7 --- /dev/null +++ b/dpnp/tensor/_ctors.py @@ -0,0 +1,1972 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
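# astype() sketch, assuming a default SYCL device: casting="safe" rejects
# lossy conversions, and copy=False may return the input unchanged.
import dpctl.tensor as dpt

x = dpt.arange(4, dtype="i4")
y = dpt.astype(x, "f4")              # new allocation with requested dtype
same = dpt.astype(x, "i4", copy=False)
assert same is x                     # no copy needed: dtype and order match
try:
    dpt.astype(y, "i4", casting="safe")
except TypeError:
    pass                             # float -> int is not a safe cast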
+# ***************************************************************************** + +import operator +from numbers import Number + +import dpctl +import dpctl.memory as dpm +import numpy as np +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +from ._copy_utils import ( + _empty_like_orderK, + _from_numpy_empty_like_orderK, +) +from ._data_types import _get_dtype +from ._device import normalize_queue_device +from ._usmarray import _is_object_with_buffer_protocol + +__doc__ = "Implementation of creation functions in :module:`dpctl.tensor`" + +_empty_tuple = () +_host_set = frozenset([None]) + + +def _array_info_dispatch(obj): + if isinstance(obj, dpt.usm_ndarray): + return obj.shape, obj.dtype, frozenset([obj.sycl_queue]) + if isinstance(obj, np.ndarray): + return obj.shape, obj.dtype, _host_set + if isinstance(obj, range): + return (len(obj),), int, _host_set + if isinstance(obj, bool): + return _empty_tuple, bool, _host_set + if isinstance(obj, float): + return _empty_tuple, float, _host_set + if isinstance(obj, int): + return _empty_tuple, int, _host_set + if isinstance(obj, complex): + return _empty_tuple, complex, _host_set + if isinstance( + obj, + ( + list, + tuple, + ), + ): + return _array_info_sequence(obj) + if _is_object_with_buffer_protocol(obj): + np_obj = np.array(obj) + return np_obj.shape, np_obj.dtype, _host_set + if hasattr(obj, "__usm_ndarray__"): + usm_ar = obj.__usm_ndarray__ + if isinstance(usm_ar, dpt.usm_ndarray): + return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue]) + if hasattr(obj, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(obj) + return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue]) + + +def _array_info_sequence(li): + if not isinstance(li, (list, tuple, range)): + raise TypeError(f"Expected list, tuple, or range, got {type(li)}") + n = len(li) + dim = None + dt = None + device = frozenset() + for el in li: + el_dim, el_dt, el_dev = _array_info_dispatch(el) + if dim is None: + dim = el_dim + dt = np.promote_types(el_dt, el_dt) + device = device.union(el_dev) + elif el_dim == dim: + dt = np.promote_types(dt, el_dt) + device = device.union(el_dev) + else: + raise ValueError(f"Inconsistent dimensions, {dim} and {el_dim}") + if dim is None: + dim = () + dt = float + device = _host_set + return (n,) + dim, dt, device + + +def _asarray_from_numpy_ndarray( + ary, dtype=None, usm_type=None, sycl_queue=None, order="K" +): + if not isinstance(ary, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(ary)}") + if usm_type is None: + usm_type = "device" + copy_q = normalize_queue_device(sycl_queue=None, device=sycl_queue) + if ary.dtype.char not in "?bBhHiIlLqQefdFD": + raise TypeError( + f"Numpy array of data type {ary.dtype} is not supported. " + "Please convert the input to an array with numeric data type." 
+ ) + if dtype is None: + # deduce device-representable output data type + dtype = _map_to_device_dtype(ary.dtype, copy_q) + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + f_contig = ary.flags["F"] + c_contig = ary.flags["C"] + fc_contig = f_contig or c_contig + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and fc_contig: + order = "C" if c_contig else "F" + if order == "K": + # new USM allocation + res = _from_numpy_empty_like_orderK(ary, dtype, usm_type, copy_q) + else: + res = dpt.usm_ndarray( + ary.shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": copy_q}, + ) + res[...] = ary + return res + + +def _asarray_from_seq( + seq_obj, + seq_shape, + seq_dt, + alloc_q, + exec_q, + dtype=None, + usm_type=None, + order="C", +): + """`seq_obj` is a sequence""" + if usm_type is None: + usm_types_in_seq = [] + _usm_types_walker(seq_obj, usm_types_in_seq) + usm_type = dpt.get_coerced_usm_type(usm_types_in_seq) + dpt.validate_usm_type(usm_type) + if dtype is None: + dtype = _map_to_device_dtype(seq_dt, alloc_q) + else: + _mapped_dt = _map_to_device_dtype(dtype, alloc_q) + if _mapped_dt != dtype: + raise ValueError( + f"Device {alloc_q.sycl_device} " + f"does not support {dtype} natively." + ) + dtype = _mapped_dt + if order in "KA": + order = "C" + if isinstance(exec_q, dpctl.SyclQueue): + res = dpt.empty( + seq_shape, + dtype=dtype, + usm_type=usm_type, + sycl_queue=alloc_q, + order=order, + ) + _manager = SequentialOrderManager[exec_q] + _device_copy_walker(seq_obj, res, _manager) + return res + else: + res = dpt.empty( + seq_shape, + dtype=dtype, + usm_type=usm_type, + sycl_queue=alloc_q, + order=order, + ) + _copy_through_host_walker(seq_obj, res) + return res + + +def _asarray_from_seq_single_device( + obj, + seq_shape, + seq_dt, + seq_dev, + dtype=None, + usm_type=None, + sycl_queue=None, + order="C", +): + if sycl_queue is None: + exec_q = seq_dev + alloc_q = seq_dev + else: + exec_q = dpt.get_execution_queue( + ( + sycl_queue, + seq_dev, + ) + ) + alloc_q = sycl_queue + return _asarray_from_seq( + obj, + seq_shape, + seq_dt, + alloc_q, + exec_q, + dtype=dtype, + usm_type=usm_type, + order=order, + ) + + +def _asarray_from_usm_ndarray( + usm_ndary, + dtype=None, + copy=None, + usm_type=None, + sycl_queue=None, + order="K", +): + if not isinstance(usm_ndary, dpt.usm_ndarray): + raise TypeError( + f"Expected dpnp.tensor.usm_ndarray, got {type(usm_ndary)}" + ) + if usm_type is None: + usm_type = usm_ndary.usm_type + if sycl_queue is not None: + exec_q = dpt.get_execution_queue([usm_ndary.sycl_queue, sycl_queue]) + copy_q = normalize_queue_device(sycl_queue=sycl_queue, device=exec_q) + else: + copy_q = usm_ndary.sycl_queue + if dtype is None: + dtype = _map_to_device_dtype(usm_ndary.dtype, copy_q) + # Conditions for zero copy: + can_zero_copy = copy is not True + # dtype is unchanged + can_zero_copy = can_zero_copy and dtype == usm_ndary.dtype + # USM allocation type is unchanged + can_zero_copy = can_zero_copy and usm_type == usm_ndary.usm_type + # sycl_queue is unchanged + can_zero_copy = can_zero_copy and copy_q is usm_ndary.sycl_queue + # order is unchanged + c_contig = usm_ndary.flags.c_contiguous + f_contig = usm_ndary.flags.f_contiguous + fc_contig = usm_ndary.flags.forc + if can_zero_copy: + if order == "C" and c_contig: + pass + elif order == "F" and f_contig: + pass + elif order == "A" and fc_contig: + pass + elif order == "K": + pass + else: + can_zero_copy = False + if copy is False and 
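# Zero-copy sketch for _asarray_from_usm_ndarray() above, via the public
# dpctl.tensor API (assumes a default SYCL device): asarray() returns the
# input usm_ndarray unchanged when dtype, usm_type, queue, and order are
# all compatible.
import dpctl.tensor as dpt

x = dpt.ones(5, dtype="f4")
y = dpt.asarray(x)            # all zero-copy conditions hold
assert y is x
z = dpt.asarray(x, copy=True)
assert z is not x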
can_zero_copy is False: + raise ValueError("asarray(..., copy=False) is not possible") + if can_zero_copy: + return usm_ndary + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and fc_contig: + order = "C" if c_contig else "F" + if order == "K": + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + res = _empty_like_orderK(usm_ndary, dtype, usm_type, copy_q) + else: + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + res = dpt.usm_ndarray( + usm_ndary.shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": copy_q}, + ) + eq = dpt.get_execution_queue([usm_ndary.sycl_queue, copy_q]) + if eq is not None: + _manager = SequentialOrderManager[eq] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + tmp = dpt.asnumpy(usm_ndary) + res[...] = tmp + return res + + +def _cast_fill_val(fill_val, dt): + """ + Casts the Python scalar `fill_val` to another Python type coercible to the + requested data type `dt`, if necessary. + """ + val_type = type(fill_val) + if val_type in [float, complex] and np.issubdtype(dt, np.integer): + return int(fill_val.real) + elif val_type is complex and np.issubdtype(dt, np.floating): + return fill_val.real + elif val_type is int and np.issubdtype(dt, np.integer): + return _to_scalar(fill_val, dt) + else: + return fill_val + + +def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False): + """Deduce arange type from sequence spec""" + nd, seq_dt, d = _array_info_sequence(args) + if d != _host_set or nd != (len(args),): + raise ValueError(err_msg) + dt = _get_dtype(dt, sycl_queue, ref_type=seq_dt) + if np.issubdtype(dt, np.integer): + return tuple(int(v) for v in args), dt + if np.issubdtype(dt, np.floating): + return tuple(float(v) for v in args), dt + if np.issubdtype(dt, np.complexfloating): + return tuple(complex(v) for v in args), dt + if allow_bool and dt.char == "?": + return tuple(bool(v) for v in args), dt + raise ValueError(f"Data type {dt} is not supported") + + +def _copy_through_host_walker(seq_o, usm_res): + if isinstance(seq_o, dpt.usm_ndarray): + if ( + dpt.get_execution_queue( + ( + usm_res.sycl_queue, + seq_o.sycl_queue, + ) + ) + is None + ): + usm_res[...] = dpt.asnumpy(seq_o).copy() + return + else: + usm_res[...] = seq_o + if hasattr(seq_o, "__usm_ndarray__"): + usm_arr = seq_o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + _copy_through_host_walker(usm_arr, usm_res) + return + if hasattr(seq_o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(seq_o) + if ( + dpt.get_execution_queue( + ( + usm_res.sycl_queue, + usm_ar.sycl_queue, + ) + ) + is None + ): + usm_res[...] = dpt.asnumpy(usm_ar).copy() + else: + usm_res[...] = usm_ar + return + if _is_object_with_buffer_protocol(seq_o): + np_ar = np.asarray(seq_o) + usm_res[...] = np_ar + return + if isinstance(seq_o, (list, tuple)): + for i, el in enumerate(seq_o): + _copy_through_host_walker(el, usm_res[i]) + return + usm_res[...] 
= np.asarray(seq_o) + + +def _device_copy_walker(seq_o, res, _manager): + if isinstance(seq_o, dpt.usm_ndarray): + exec_q = res.sycl_queue + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=seq_o, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, cpy_ev) + return + if hasattr(seq_o, "__usm_ndarray__"): + usm_arr = seq_o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + _device_copy_walker(usm_arr, res, _manager) + return + if hasattr(seq_o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(seq_o) + exec_q = res.sycl_queue + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ar, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, cpy_ev) + return + if isinstance(seq_o, (list, tuple)): + for i, el in enumerate(seq_o): + _device_copy_walker(el, res[i], _manager) + return + raise TypeError + + +def _ensure_native_dtype_device_support(dtype, dev) -> None: + """Check that dtype is natively supported by device. + + Arg: + dtype: + Elemental data-type + dev (:class:`dpctl.SyclDevice`): + The device about which the query is being made. + Returns: + None + Raise: + ValueError: + if device does not natively support this `dtype`. + """ + if dtype in [dpt.float64, dpt.complex128] and not dev.has_aspect_fp64: + raise ValueError( + f"Device {dev.name} does not provide native support " + "for double-precision floating point type." + ) + if ( + dtype + in [ + dpt.float16, + ] + and not dev.has_aspect_fp16 + ): + raise ValueError( + f"Device {dev.name} does not provide native support " + "for half-precision floating point type." + ) + + +def _get_arange_length(start, stop, step): + """Compute length of arange sequence""" + span = stop - start + if hasattr(step, "__float__") and hasattr(span, "__float__"): + return _round_for_arange(span / step) + tmp = span / step + if hasattr(tmp, "__complex__"): + tmp = complex(tmp) + tmp = tmp.real + else: + tmp = float(tmp) + return _round_for_arange(tmp) + + +def _map_to_device_dtype(dt, q): + dtc = dt.char + if dtc == "?" or np.issubdtype(dt, np.integer): + return dt + d = q.sycl_device + if np.issubdtype(dt, np.floating): + if dtc == "f": + return dt + if dtc == "d" and d.has_aspect_fp64: + return dt + if dtc == "e" and d.has_aspect_fp16: + return dt + return dpt.dtype("f4") + if np.issubdtype(dt, np.complexfloating): + if dtc == "F": + return dt + if dtc == "D" and d.has_aspect_fp64: + return dt + return dpt.dtype("c8") + raise RuntimeError(f"Unrecognized data type '{dt}' encountered.") + + +def _normalize_order(order, arr): + """ + Utility function for processing the `order` keyword of array-like + constructors, which support `"K"` and `"A"` orders. + """ + arr_flags = arr.flags + f_contig = arr_flags["F"] + c_contig = arr_flags["C"] + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and (f_contig or c_contig): + order = "C" if c_contig else "F" + return order + + +def _round_for_arange(tmp): + k = int(tmp) + if k >= 0 and float(k) < tmp: + tmp = tmp + 1 + return tmp + + +def _to_scalar(obj, sc_ty): + """A way to convert object to NumPy scalar type. + Raises OverflowError if obj can not be represented + using the requested scalar type. 
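# A sketch of the dtype fallback implemented by _map_to_device_dtype() above:
# float64/complex128 survive only on devices with the fp64 aspect, otherwise
# the type falls back to float32/complex64. Assumes a default SYCL device.
import dpctl
import dpctl.tensor as dpt

dev = dpctl.select_default_device()
x = dpt.asarray([1.0, 2.0])   # Python floats carry float64 semantics
expected = dpt.float64 if dev.has_aspect_fp64 else dpt.float32
assert x.dtype == expected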
+ """ + zd_arr = np.asarray(obj, dtype=sc_ty) + return zd_arr[()] + + +def _usm_ndarray_from_suai(obj): + sua_iface = obj.__sycl_usm_array_interface__ + membuf = dpm.as_usm_memory(obj) + ary = dpt.usm_ndarray( + sua_iface["shape"], + dtype=sua_iface["typestr"], + buffer=membuf, + strides=sua_iface.get("strides", None), + ) + _data_field = sua_iface["data"] + if isinstance(_data_field, tuple) and len(_data_field) > 1: + ro_field = _data_field[1] + else: + ro_field = False + if ro_field: + ary.flags["W"] = False + return ary + + +def _usm_types_walker(o, usm_types_list): + if isinstance(o, dpt.usm_ndarray): + usm_types_list.append(o.usm_type) + return + if hasattr(o, "__usm_ndarray__"): + usm_arr = o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + usm_types_list.append(usm_arr.usm_type) + return + if hasattr(o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(o) + usm_types_list.append(usm_ar.usm_type) + return + if _is_object_with_buffer_protocol(o): + return + if isinstance(o, (int, bool, float, complex)): + return + if isinstance(o, (list, tuple, range)): + for el in o: + _usm_types_walker(el, usm_types_list) + return + raise TypeError + + +def arange( + start, + /, + stop=None, + step=1, + *, + dtype=None, + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Returns evenly spaced values within the half-open interval [start, stop) + as a one-dimensional array. + + Args: + start: + Starting point of the interval + stop: + Ending point of the interval. Default: ``None`` + step: Increment of the returned sequence. Default: ``1`` + dtype: Output array data type. Default: ``None`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Array populated with evenly spaced values. 
+ """ + if stop is None: + stop = start + start = 0 + if step is None: + step = 1 + dpt.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + is_bool = False + if dtype: + is_bool = (dtype is bool) or (dpt.dtype(dtype) == dpt.bool) + _, dt = _coerce_and_infer_dt( + start, + stop, + step, + dt=dpt.int8 if is_bool else dtype, + sycl_queue=sycl_queue, + err_msg="start, stop, and step must be Python scalars", + allow_bool=False, + ) + try: + tmp = _get_arange_length(start, stop, step) + sh = max(int(tmp), 0) + except TypeError: + sh = 0 + if is_bool and sh > 2: + raise ValueError("no fill-function for boolean data type") + res = dpt.usm_ndarray( + (sh,), + dtype=dt, + buffer=usm_type, + order="C", + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + sc_ty = dt.type + _first = _to_scalar(start, sc_ty) + if sh > 1: + _second = _to_scalar(start + step, sc_ty) + if dt in [dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64]: + int64_ty = dpt.int64.type + _step = int64_ty(_second) - int64_ty(_first) + else: + _step = _second - _first + _step = sc_ty(_step) + else: + _step = sc_ty(1) + _start = _first + _manager = SequentialOrderManager[sycl_queue] + # populating newly allocated array, no task dependencies + hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue) + _manager.add_event_pair(hev, lin_ev) + if is_bool: + res_out = dpt.usm_ndarray( + (sh,), + dtype=dpt.bool, + buffer=usm_type, + order="C", + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + hev_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=res, dst=res_out, sycl_queue=sycl_queue, depends=[lin_ev] + ) + _manager.add_event_pair(hev_cpy, cpy_ev) + return res_out + return res + + +def asarray( + obj, + /, + *, + dtype=None, + device=None, + copy=None, + usm_type=None, + sycl_queue=None, + order="K", +): + """ + Converts input object to :class:`dpctl.tensor.usm_ndarray`. + + Args: + obj: Python object to convert. Can be an instance of + :class:`dpctl.tensor.usm_ndarray`, + an object representing SYCL USM allocation and implementing + ``__sycl_usm_array_interface__`` protocol, an instance + of :class:`numpy.ndarray`, an object supporting Python buffer + protocol, a Python scalar, or a (possibly nested) sequence of + Python scalars. + dtype (data type, optional): + output array data type. If ``dtype`` is + ``None``, the output array data type is inferred from data types in + ``obj``. Default: ``None`` + copy (`bool`, optional): + boolean indicating whether or not to copy the + input. If ``True``, always creates a copy. If ``False``, the + need to copy raises :exc:`ValueError`. If ``None``, tries to reuse + existing memory allocations if possible, but allows to perform + a copy otherwise. Default: ``None`` + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + memory layout of the output array. Default: ``"K"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. 
``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Array created from input object. + """ + # 1. Check that copy is a valid keyword + if copy not in [None, True, False]: + raise TypeError( + "Recognized copy keyword values should be True, False, or None" + ) + # 2. Check that dtype is None, or a valid dtype + if dtype is not None: + dtype = dpt.dtype(dtype) + # 3. Validate order + if not isinstance(order, str): + raise TypeError( + f"Expected order keyword to be of type str, got {type(order)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + # 4. Check that usm_type is None, or a valid value + dpt.validate_usm_type(usm_type, allow_none=True) + # 5. Normalize device/sycl_queue [keep it None if was None] + if device is not None or sycl_queue is not None: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + + # handle instance(obj, usm_ndarray) + if isinstance(obj, dpt.usm_ndarray): + return _asarray_from_usm_ndarray( + obj, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if hasattr(obj, "__usm_ndarray__"): + usm_arr = obj.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + return _asarray_from_usm_ndarray( + usm_arr, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if hasattr(obj, "__sycl_usm_array_interface__"): + ary = _usm_ndarray_from_suai(obj) + return _asarray_from_usm_ndarray( + ary, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if isinstance(obj, np.ndarray): + if copy is False: + raise ValueError( + "Converting numpy.ndarray to usm_ndarray requires a copy" + ) + return _asarray_from_numpy_ndarray( + obj, + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if _is_object_with_buffer_protocol(obj): + if copy is False: + raise ValueError( + f"Converting {type(obj)} to usm_ndarray requires a copy" + ) + return _asarray_from_numpy_ndarray( + np.array(obj), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if isinstance(obj, (list, tuple, range)): + if copy is False: + raise ValueError( + "Converting Python sequence to usm_ndarray requires a copy" + ) + seq_shape, seq_dt, devs = _array_info_sequence(obj) + if devs == _host_set: + return _asarray_from_numpy_ndarray( + np.asarray(obj, dtype=dtype, order=order), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + elif len(devs) == 1: + seq_dev = list(devs)[0] + return _asarray_from_seq_single_device( + obj, + seq_shape, + seq_dt, + seq_dev, + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + elif len(devs) > 1: + devs = [dev for dev in devs if dev is not None] + if sycl_queue is None: + if len(devs) == 1: + alloc_q = devs[0] + else: + raise dpt.ExecutionPlacementError( + "Please specify `device` or `sycl_queue` keyword " + "argument to determine where to allocate the " + "resulting array." 
+ ) + else: + alloc_q = sycl_queue + return _asarray_from_seq( + obj, + seq_shape, + seq_dt, + alloc_q, + # force copying via host + None, + dtype=dtype, + usm_type=usm_type, + order=order, + ) + if copy is False: + raise ValueError( + f"Converting {type(obj)} to usm_ndarray requires a copy" + ) + # obj is a scalar, create 0d array + return _asarray_from_numpy_ndarray( + np.asarray(obj, dtype=dtype), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order="C", + ) + + +def empty( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Creates :class:`dpctl.tensor.usm_ndarray` from uninitialized + USM allocation. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. The ``None`` value creates an + array of floating point data type. Default: ``None`` + order (``"C"``, or ``F"``): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Created empty array. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpt.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + return res + + +def empty_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Returns an uninitialized :class:`dpctl.tensor.usm_ndarray` with the + same `shape` as the input array `x`. + + Args: + x (usm_ndarray): + Input array from which to derive the output array shape. + dtype (optional): + data type of the array. Can be a typestring, + a :class:`numpy.dtype` object, NumPy char string, + or a NumPy scalar type. Default: ``None`` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"K"`` + device (optional): array API concept of device where the output array + is created. 
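# asarray() host-data sketch, assuming a default SYCL device: host inputs
# (NumPy arrays, nested sequences, buffers) always require a copy into USM
# memory, so copy=False raises for them.
import numpy as np
import dpctl.tensor as dpt

arr = dpt.asarray([[1, 2], [3, 4]])          # nested sequence -> 2D array
assert arr.shape == (2, 2)
try:
    dpt.asarray(np.zeros(3), copy=False)
except ValueError:
    pass                                     # copy is unavoidable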
``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation. Default: ``None`` + + Returns: + usm_ndarray: + Created empty array with uninitialized memory. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpt.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + return _empty_like_orderK(x, dtype, usm_type, sycl_queue) + else: + shape = x.shape + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + return res + + +def eye( + n_rows, + n_cols=None, + /, + *, + k=0, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + eye(n_rows, n_cols=None, /, *, k=0, dtype=None, \ + device=None, usm_type="device", sycl_queue=None) + + Creates :class:`dpctl.tensor.usm_ndarray` with ones on the `k`-th + diagonal. + + Args: + n_rows (int): + number of rows in the output array. + n_cols (int, optional): + number of columns in the output array. If ``None``, + ``n_cols = n_rows``. Default: ``None`` + k (int): + index of the diagonal, with ``0`` as the main diagonal. + A positive value of ``k`` is a superdiagonal, a negative value + is a subdiagonal. + Raises :exc:`TypeError` if ``k`` is not an integer. + Default: ``0`` + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or + a NumPy scalar type. Default: ``None`` + order ("C" or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. 
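# empty_like() sketch, assuming a default SYCL device: dtype, usm_type,
# and device default to those of the prototype array.
import dpctl.tensor as dpt

x = dpt.ones((2, 3), dtype="f4", usm_type="shared")
y = dpt.empty_like(x)
assert y.dtype == x.dtype
assert y.usm_type == "shared"
assert y.sycl_device == x.sycl_device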
+ Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + A diagonal matrix. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + n_rows = operator.index(n_rows) + n_cols = n_rows if n_cols is None else operator.index(n_cols) + k = operator.index(k) + if k >= n_cols or -k >= n_rows: + return dpt.zeros( + (n_rows, n_cols), + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + dpt.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + (n_rows, n_cols), + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + if n_rows != 0 and n_cols != 0: + _manager = SequentialOrderManager[sycl_queue] + hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue) + _manager.add_event_pair(hev, eye_ev) + return res + + +def _validate_fill_value(fill_val): + """Validates that `fill_val` is a numeric or boolean scalar.""" + # TODO: verify if `np.True_` and `np.False_` should be instances of + # Number in NumPy, like other NumPy scalars and like Python bools + # check for `np.bool_` separately as NumPy<2 has no `np.bool` + if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_): + raise TypeError( + f"array cannot be filled with scalar of type {type(fill_val)}" + ) + + +def full( + shape, + fill_value, + *, + dtype=None, + order="C", + device=None, + usm_type=None, + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with `fill_value`. + + Args: + shape (tuple): + Dimensions of the array to be created. + fill_value (int,float,complex,usm_ndarray): + fill value + dtype (optional): data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. 
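# eye() sketch, assuming a default SYCL device: k selects the diagonal,
# and an out-of-range k yields an all-zero matrix.
import numpy as np
import dpctl.tensor as dpt

m = dpt.eye(3, 4, k=1, dtype="i4")
assert np.array_equal(dpt.asnumpy(m), np.eye(3, 4, k=1, dtype="i4"))
assert not dpt.asnumpy(dpt.eye(2, k=5)).any()   # diagonal lies outside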
If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpt.validate_usm_type(usm_type, allow_none=True) + + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + if ( + isinstance(fill_value, dpt.usm_ndarray) + and sycl_queue is None + and device is None + ): + sycl_queue = fill_value.sycl_queue + else: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + return dpt.copy(dpt.broadcast_to(X, shape), order=order) + else: + _validate_fill_value(fill_value) + + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + usm_type = usm_type if usm_type is not None else "device" + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + fill_value = _cast_fill_val(fill_value, dtype) + + _manager = SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + + +def full_like( + x, + /, + fill_value, + *, + dtype=None, + order="K", + device=None, + usm_type=None, + sycl_queue=None, +): + """full_like(x, fill_value, dtype=None, order="K", \ + device=None, usm_type=None, sycl_queue=None) + + Returns a new :class:`dpctl.tensor.usm_ndarray` filled with `fill_value` + and having the same `shape` as the input array `x`. + + Args: + x (usm_ndarray): Input array from which to derive the output array + shape. + fill_value: the value to fill output array with + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or a + NumPy scalar type. If ``dtype`` is ``None``, the output array data + type is inferred from ``x``. Default: ``None`` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"K"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. 
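# full() sketch, assuming a default SYCL device: scalar fill values are
# cast to the allocated dtype, while array fill values are broadcast to
# the requested shape.
import dpctl.tensor as dpt

a = dpt.full((2, 2), 3.7, dtype="i4")           # scalar cast: 3.7 -> 3
assert int(a[0, 0]) == 3
b = dpt.full((2, 3), dpt.asarray([1, 2, 3]))    # broadcast along rows
assert b.shape == (2, 3)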
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpt.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + sh = x.shape + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + X = dpt.broadcast_to(X, sh) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = SequentialOrderManager[sycl_queue] + # order copy after tasks populating X + dep_evs = _manager.submitted_events + hev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=X, dst=res, sycl_queue=sycl_queue, depends=dep_evs + ) + _manager.add_event_pair(hev, copy_ev) + return res + else: + _validate_fill_value(fill_value) + + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + fill_value = _cast_fill_val(fill_value, dtype) + _manager = SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + return full( + sh, + fill_value, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + + +def linspace( + start, + stop, + /, + num, + *, + dtype=None, + device=None, + endpoint=True, + sycl_queue=None, + usm_type="device", +): + """ + linspace(start, stop, num, dtype=None, device=None, endpoint=True, \ + sycl_queue=None, usm_type="device") + + Returns :class:`dpctl.tensor.usm_ndarray` array populated with + evenly spaced numbers of specified interval. + + Args: + start: + the start of the interval. + stop: + the end of the interval. If the ``endpoint`` is ``False``, the + function generates ``num+1`` evenly spaced points starting + with ``start`` and ending with ``stop`` and exclude the + ``stop`` from the returned array such that the returned array + consists of evenly spaced numbers over the half-open interval + ``[start, stop)``. If ``endpoint`` is ``True``, the output + array consists of evenly spaced numbers over the closed + interval ``[start, stop]``. Default: ``True`` + num (int): + number of samples. Must be a non-negative integer; otherwise, + the function raises ``ValueError`` exception. + dtype: + output array data type. Should be a floating data type. + If ``dtype`` is ``None``, the output array must be the default + floating point data type for target device. + Default: ``None`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. 
+ Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + endpoint: boolean indicating whether to include ``stop`` in the + interval. Default: ``True`` + + Returns: + usm_ndarray: + Array populated with evenly spaced numbers in the requested + interval. + """ + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dpt.validate_usm_type(usm_type, allow_none=False) + if endpoint not in [True, False]: + raise TypeError("endpoint keyword argument must be of boolean type") + + num = operator.index(num) + if num < 0: + raise ValueError("Number of points must be non-negative") + + _, dt = _coerce_and_infer_dt( + start, + stop, + dt=dtype, + sycl_queue=sycl_queue, + err_msg="start and stop must be Python scalars.", + allow_bool=True, + ) + + int_dt = None + if np.issubdtype(dt, np.integer): + if dtype is not None: + int_dt = dt + dt = ti.default_device_fp_type(sycl_queue) + dt = dpt.dtype(dt) + start = float(start) + stop = float(stop) + + res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) + _manager = SequentialOrderManager[sycl_queue] + hev, la_ev = ti._linspace_affine( + start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue + ) + _manager.add_event_pair(hev, la_ev) + + return res if int_dt is None else dpt.astype(res, int_dt) + + +def meshgrid(*arrays, indexing="xy"): + """ + Creates list of :class:`dpctl.tensor.usm_ndarray` coordinate matrices + from vectors. + + Args: + arrays (usm_ndarray): + an arbitrary number of one-dimensional arrays + representing grid coordinates. Each array should have the same + numeric data type. + indexing (``"xy"``, or ``"ij"``): + Cartesian (``"xy"``) or matrix (``"ij"``) indexing of output. + If provided zero or one one-dimensional vector(s) (i.e., the + zero- and one-dimensional cases, respectively), the ``indexing`` + keyword has no effect and should be ignored. Default: ``"xy"`` + + Returns: + List[array]: + list of ``N`` arrays, where ``N`` is the number of + provided one-dimensional input arrays. Each returned array must + have rank ``N``. + For a set of ``n`` vectors with lengths ``N0``, ``N1``, ``N2``, ... + The cartesian indexing results in arrays of shape + ``(N1, N0, N2, ...)``, while the + matrix indexing results in arrays of shape + ``(N0, N1, N2, ...)``. + Default: ``"xy"``. + + Raises: + ValueError: If vectors are not of the same data type, or are not + one-dimensional. + + """ + ref_dt = None + ref_unset = True + for array in arrays: + if not isinstance(array, dpt.usm_ndarray): + raise TypeError( + f"Expected instance of dpt.usm_ndarray, got {type(array)}." + ) + if array.ndim != 1: + raise ValueError("All arrays must be one-dimensional.") + if ref_unset: + ref_unset = False + ref_dt = array.dtype + else: + if not ref_dt == array.dtype: + raise ValueError( + "All arrays must be of the same numeric data type." 
+ ) + if indexing not in ["xy", "ij"]: + raise ValueError( + "Unrecognized indexing keyword value, expecting 'xy' or 'ij.'" + ) + n = len(arrays) + if n == 0: + return [] + + sh = (-1,) + (1,) * (n - 1) + + res = [] + if n > 1 and indexing == "xy": + res.append(dpt.reshape(arrays[0], (1, -1) + sh[2:], copy=True)) + res.append(dpt.reshape(arrays[1], sh, copy=True)) + arrays, sh = arrays[2:], sh[-2:] + sh[:-2] + + for array in arrays: + res.append(dpt.reshape(array, sh, copy=True)) + sh = sh[-1:] + sh[:-1] + + output = dpt.broadcast_arrays(*res) + + return output + + +def ones( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ones(shape, dtype=None, order="C", \ + device=None, usm_type="device", sycl_queue=None) + + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with ones. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Created array initialized with ones. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpt.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + _manager = SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + + +def ones_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` filled with ones and + having the same `shape` as the input array `x`. + + Args: + x (usm_ndarray): + Input array from which to derive the output array shape + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: `None` + order ("C", "F", "A", or "K"): + memory layout for the array. 
Default: ``"K"``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            New array initialized with ones.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
+    if (
+        not isinstance(order, str)
+        or len(order) == 0
+        or order[0] not in "CcFfAaKk"
+    ):
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
+        )
+    order = order[0].upper()
+    if dtype is None:
+        dtype = x.dtype
+    if usm_type is None:
+        usm_type = x.usm_type
+    dpt.validate_usm_type(usm_type, allow_none=False)
+    if device is None and sycl_queue is None:
+        device = x.device
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = dpt.dtype(dtype)
+    order = _normalize_order(order, x)
+    if order == "K":
+        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+        res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
+        _manager = SequentialOrderManager[sycl_queue]
+        # populating new allocation, no dependent events
+        hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue)
+        _manager.add_event_pair(hev, full_ev)
+        return res
+    else:
+        sh = x.shape
+        return ones(
+            sh,
+            dtype=dtype,
+            order=order,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+
+
+def tril(x, /, *, k=0):
+    """
+    Returns the lower triangular part of a matrix (or a stack of matrices)
+    ``x``.
+
+    The lower triangular part of the matrix is defined as the elements on and
+    below the specified diagonal ``k``.
+
+    Args:
+        x (usm_ndarray):
+            Input array
+        k (int, optional):
+            Specifies the diagonal above which to set
+            elements to zero. If ``k = 0``, the diagonal is the main diagonal.
+            If ``k < 0``, the diagonal is below the main diagonal.
+            If ``k > 0``, the diagonal is above the main diagonal.
+            Default: ``0``
+
+    Returns:
+        usm_ndarray:
+            A lower-triangular array or a stack of lower-triangular arrays.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected argument of type dpnp.tensor.usm_ndarray, "
+            f"got {type(x)}."
+ ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k >= shape[nd - 1] - 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + elif k < -shape[nd - 2]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, tril_ev) + + return res + + +def triu(x, /, *, k=0): + """ + Returns the upper triangular part of a matrix (or a stack of matrices) + ``x``. + + The upper triangular part of the matrix is defined as the elements on and + above the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal below which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + An upper-triangular array or a stack of upper-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpnp.tensor.usm_ndarray, " + f"got {type(x)}." + ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k > shape[nd - 1]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + elif k <= -shape[nd - 2] + 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, triu_ev) + + return res + + +def zeros( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with zeros. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. 
``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            Constructed array initialized with zeros.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    order = order[0].upper()
+    dpt.validate_usm_type(usm_type, allow_none=False)
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = _get_dtype(dtype, sycl_queue)
+    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+    res = dpt.usm_ndarray(
+        shape,
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    _manager = SequentialOrderManager[sycl_queue]
+    # populating new allocation, no dependent events
+    hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue)
+    _manager.add_event_pair(hev, zeros_ev)
+
+    return res
+
+
+def zeros_like(
+    x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None
+):
+    """
+    Creates :class:`dpctl.tensor.usm_ndarray` from USM allocation
+    initialized with zeros.
+
+    Args:
+        x (usm_ndarray):
+            Input array from which to derive the shape of the
+            output array.
+        dtype (optional):
+            data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string, or a
+            NumPy scalar type. If ``None``, output array has the same data
+            type as the input array. Default: ``None``
+        order ("C", "F", "A", or "K"):
+            memory layout for the array. Default: ``"K"``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            New array initialized with zeros.
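+
+    Example:
+        A minimal illustrative sketch (assumes the package is importable as
+        ``dpnp.tensor``):
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.ones((3,), dtype="i4")
+        >>> y = dpt.zeros_like(x)
+        >>> y.dtype
+        dtype('int32')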
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpt.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(0, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + sh = x.shape + return zeros( + sh, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) diff --git a/dpnp/tensor/_data_types.py b/dpnp/tensor/_data_types.py new file mode 100644 index 000000000000..faf30ffdabd0 --- /dev/null +++ b/dpnp/tensor/_data_types.py @@ -0,0 +1,104 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from numpy import bool_ as np_bool_ +from numpy import complexfloating as np_complexfloating +from numpy import dtype +from numpy import floating as np_floating +from numpy import integer as np_integer +from numpy import issubdtype as np_issubdtype + +from ._tensor_impl import ( + default_device_bool_type as ti_default_device_bool_type, +) +from ._tensor_impl import ( + default_device_complex_type as ti_default_device_complex_type, +) +from ._tensor_impl import default_device_fp_type as ti_default_device_fp_type +from ._tensor_impl import default_device_int_type as ti_default_device_int_type + +bool = dtype("bool") +int8 = dtype("int8") +int16 = dtype("int16") +int32 = dtype("int32") +int64 = dtype("int64") +uint8 = dtype("uint8") +uint16 = dtype("uint16") +uint32 = dtype("uint32") +uint64 = dtype("uint64") +float16 = dtype("float16") +float32 = dtype("float32") +float64 = dtype("float64") +complex64 = dtype("complex64") +complex128 = dtype("complex128") + + +def _get_dtype(inp_dt, sycl_obj, ref_type=None): + """ + Type inference utility to construct data type + object with defaults based on reference type. + + _get_dtype is used by dpctl.tensor.asarray + to infer data type of the output array from the + input sequence. + """ + if inp_dt is None: + if ref_type in [None, float] or np_issubdtype(ref_type, np_floating): + fp_dt = ti_default_device_fp_type(sycl_obj) + return dtype(fp_dt) + if ref_type in [bool, np_bool_]: + bool_dt = ti_default_device_bool_type(sycl_obj) + return dtype(bool_dt) + if ref_type is int or np_issubdtype(ref_type, np_integer): + int_dt = ti_default_device_int_type(sycl_obj) + return dtype(int_dt) + if ref_type is complex or np_issubdtype(ref_type, np_complexfloating): + cfp_dt = ti_default_device_complex_type(sycl_obj) + return dtype(cfp_dt) + raise TypeError(f"Reference type {ref_type} not recognized.") + return dtype(inp_dt) + + +__all__ = [ + "dtype", + "_get_dtype", + "bool", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float16", + "float32", + "float64", + "complex64", + "complex128", +] diff --git a/dpnp/tensor/_device.py b/dpnp/tensor/_device.py new file mode 100644 index 000000000000..5f2725c74855 --- /dev/null +++ b/dpnp/tensor/_device.py @@ -0,0 +1,197 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+import dpctl
+from dpctl._sycl_device_factory import _cached_default_device
+from dpctl._sycl_queue_manager import get_device_cached_queue
+
+from ._compute_follows_data import get_execution_queue
+
+__doc__ = "Implementation of array API mandated Device class"
+
+
+class Device:
+    """
+    An object representing the array API concept of device.
+
+    This is a wrapper around :class:`dpctl.SyclQueue` with custom
+    formatting. The class does not have a public constructor,
+    but a class method :meth:`dpctl.tensor.Device.create_device` to construct
+    it from the `device` keyword argument in Array-API functions.
+
+    Instances can be queried for ``sycl_queue``, ``sycl_context``,
+    or ``sycl_device``.
+    """
+
+    __device_queue_map__ = {}
+    sycl_queue_ = None
+
+    def __new__(cls, *args, **kwargs):
+        raise TypeError("No public constructor")
+
+    @classmethod
+    def create_device(cls, device=None):
+        """Device.create_device(device=None)
+
+        Creates an instance of Device from the argument.
+
+        Args:
+            device:
+                Device specification, i.e. `None`, :class:`.Device`,
+                :class:`dpctl.SyclQueue`, or a :class:`dpctl.SyclDevice`
+                corresponding to a root SYCL device.
+        Raises:
+            ValueError: if an instance of :class:`dpctl.SyclDevice`
+                corresponding to a sub-device was specified as the argument
+            SyclQueueCreationError: if :class:`dpctl.SyclQueue` could not be
+                created from the argument
+        """
+        dev = device
+        obj = super().__new__(cls)
+        if isinstance(dev, Device):
+            obj.sycl_queue_ = dev.sycl_queue
+        elif isinstance(dev, dpctl.SyclQueue):
+            obj.sycl_queue_ = dev
+        elif isinstance(dev, dpctl.SyclDevice):
+            par = dev.parent_device
+            if par is None:
+                obj.sycl_queue_ = get_device_cached_queue(dev)
+            else:
+                raise ValueError(
+                    f"Using non-root device {dev} to specify offloading "
+                    "target is ambiguous. Please use dpctl.SyclQueue "
+                    "targeting this device"
+                )
+        else:
+            if dev is None:
+                _dev = _cached_default_device()
+            else:
+                _dev = dpctl.SyclDevice(dev)
+            obj.sycl_queue_ = get_device_cached_queue(_dev)
+        return obj
+
+    @property
+    def sycl_queue(self):
+        """:class:`dpctl.SyclQueue` used to offload to this :class:`.Device`."""
+        return self.sycl_queue_
+
+    @property
+    def sycl_context(self):
+        """:class:`dpctl.SyclContext` associated with this :class:`.Device`."""
+        return self.sycl_queue_.sycl_context
+
+    @property
+    def sycl_device(self):
+        """:class:`dpctl.SyclDevice` targeted by this :class:`.Device`."""
+        return self.sycl_queue_.sycl_device
+
+    def __repr__(self):
+        try:
+            sd = self.sycl_device
+        except AttributeError as exc:
+            raise ValueError(
+                f"Instance of {self.__class__} is not initialized"
+            ) from exc
+        try:
+            fs = sd.filter_string
+            return f"Device({fs})"
+        except TypeError:
+            # This is a sub-device
+            return repr(self.sycl_queue)
+
+    def print_device_info(self):
+        """Outputs information about targeted SYCL device"""
+        self.sycl_device.print_device_info()
+
+    def wait(self):
+        """Call ``wait`` method of the underlying ``sycl_queue``."""
+        self.sycl_queue_.wait()
+
+    def __eq__(self, other):
+        """Equality comparison based on underlying ``sycl_queue``."""
+        if isinstance(other, Device):
+            return self.sycl_queue.__eq__(other.sycl_queue)
+        elif isinstance(other, dpctl.SyclQueue):
+            return self.sycl_queue.__eq__(other)
+        return False
+
+    def __hash__(self):
+        """Compute object's hash value."""
+        return self.sycl_queue.__hash__()
+
+
+def normalize_queue_device(sycl_queue=None, device=None):
+    """normalize_queue_device(sycl_queue=None, device=None)
+
+    Utility to process exclusive keyword arguments 'device'
+    and 'sycl_queue' in functions of `dpctl.tensor`.
+
+    Args:
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            explicitly indicates where USM allocation is done
+            and the population code (if any) is executed.
+            Value ``None`` means the SYCL queue is derived from
+            the `device` keyword, or the default queue is used.
+            Default: None
+        device (string, :class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`,
+            :class:`dpctl.tensor.Device`, optional):
+            array-API keyword indicating non-partitioned SYCL device
+            where array is allocated.
+
+    Returns:
+        :class:`dpctl.SyclQueue` object implied by either of provided
+        keywords. If both are None, `dpctl.SyclQueue()` is returned.
+        If both are specified and imply the same queue, `sycl_queue`
+        is returned.
+
+    Raises:
+        TypeError: if argument is not of the expected type, or keywords
+            imply incompatible queues.
+    """
+    q = sycl_queue
+    d = device
+    if q is None:
+        d = Device.create_device(d)
+        return d.sycl_queue
+    if not isinstance(q, dpctl.SyclQueue):
+        raise TypeError(f"Expected dpctl.SyclQueue, got {type(q)}")
+    if d is None:
+        return q
+    d = Device.create_device(d)
+    qq = get_execution_queue(
+        (
+            q,
+            d.sycl_queue,
+        )
+    )
+    if qq is None:
+        raise TypeError(
+            "sycl_queue and device keywords imply incompatible queues"
+        )
+    return qq
diff --git a/dpnp/tensor/_dldevice_conversions.py b/dpnp/tensor/_dldevice_conversions.py
new file mode 100644
index 000000000000..595a280689a5
--- /dev/null
+++ b/dpnp/tensor/_dldevice_conversions.py
@@ -0,0 +1,52 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl._sycl_device import SyclDevice + +from ._usmarray import DLDeviceType + + +def dldevice_to_sycl_device(dl_dev: tuple): + if isinstance(dl_dev, tuple): + if len(dl_dev) != 2: + raise ValueError("dldevice tuple must have length 2") + else: + raise TypeError( + f"dl_dev is expected to be a 2-tuple, got " f"{type(dl_dev)}" + ) + if dl_dev[0] != DLDeviceType.kDLOneAPI: + raise ValueError("dldevice type must be kDLOneAPI") + return SyclDevice(str(dl_dev[1])) + + +def sycl_device_to_dldevice(dev: SyclDevice): + if not isinstance(dev, SyclDevice): + raise TypeError( + "dev is expected to be a SyclDevice, got " f"{type(dev)}" + ) + return (DLDeviceType.kDLOneAPI, dev.get_device_id()) diff --git a/dpnp/tensor/_dlpack.pxd b/dpnp/tensor/_dlpack.pxd new file mode 100644 index 000000000000..75378bfa7a92 --- /dev/null +++ b/dpnp/tensor/_dlpack.pxd @@ -0,0 +1,73 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+cdef extern from "numpy/npy_no_deprecated_api.h":
+    pass
+from dpctl._sycl_device cimport SyclDevice
+from numpy cimport ndarray
+
+from ._usmarray cimport usm_ndarray
+
+
+cdef extern from "dlpack/dlpack.h" nogil:
+    int device_CPU "kDLCPU"
+    int device_CUDA "kDLCUDA"
+    int device_CUDAHost "kDLCUDAHost"
+    int device_CUDAManaged "kDLCUDAManaged"
+    int device_DLROCM "kDLROCM"
+    int device_ROCMHost "kDLROCMHost"
+    int device_OpenCL "kDLOpenCL"
+    int device_Vulkan "kDLVulkan"
+    int device_Metal "kDLMetal"
+    int device_VPI "kDLVPI"
+    int device_OneAPI "kDLOneAPI"
+    int device_WebGPU "kDLWebGPU"
+    int device_Hexagon "kDLHexagon"
+    int device_MAIA "kDLMAIA"
+    int device_Trn "kDLTrn"
+
+cpdef object to_dlpack_capsule(usm_ndarray array) except +
+cpdef object to_dlpack_versioned_capsule(
+    usm_ndarray array, bint copied
+) except +
+cpdef object numpy_to_dlpack_versioned_capsule(
+    ndarray array, bint copied
+) except +
+cpdef object from_dlpack_capsule(object dltensor) except +
+
+
+cdef class DLPackCreationError(Exception):
+    """
+    A DLPackCreationError exception is raised when constructing
+    DLPack capsule from `usm_ndarray` based on a USM allocation
+    on a partitioned SYCL device.
+    """
+    pass
diff --git a/dpnp/tensor/_dlpack.pyx b/dpnp/tensor/_dlpack.pyx
new file mode 100644
index 000000000000..947377d3a660
--- /dev/null
+++ b/dpnp/tensor/_dlpack.pyx
@@ -0,0 +1,1243 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +cdef extern from "numpy/npy_no_deprecated_api.h": + pass + +cimport cpython +cimport dpctl as c_dpctl +cimport dpctl.memory as c_dpmem +from dpctl._backend cimport ( + DPCTLDevice_Delete, + DPCTLDevice_GetParentDevice, + DPCTLSyclDeviceRef, + DPCTLSyclUSMRef, +) +from dpctl._sycl_queue_manager cimport get_device_cached_queue +from libc cimport stdlib +from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t +from numpy cimport ndarray + +from ._usmarray cimport ( + USM_ARRAY_C_CONTIGUOUS, + USM_ARRAY_F_CONTIGUOUS, + USM_ARRAY_WRITABLE, + usm_ndarray, +) + +import ctypes + +import dpctl +import dpctl.memory as dpmem +import numpy as np + +from ._device import Device + + +cdef extern from "dlpack/dlpack.h" nogil: + cdef int DLPACK_MAJOR_VERSION + + cdef int DLPACK_MINOR_VERSION + + cdef int DLPACK_FLAG_BITMASK_READ_ONLY + + cdef int DLPACK_FLAG_BITMASK_IS_COPIED + + ctypedef struct DLPackVersion: + uint32_t major + uint32_t minor + + cdef enum DLDeviceType: + kDLCPU + kDLCUDA + kDLCUDAHost + kDLCUDAManaged + kDLROCM + kDLROCMHost + kDLOpenCL + kDLVulkan + kDLMetal + kDLVPI + kDLOneAPI + kDLWebGPU + kDLHexagon + kDLMAIA + kDLTrn + + ctypedef struct DLDevice: + DLDeviceType device_type + int device_id + + cdef enum DLDataTypeCode: + kDLInt + kDLUInt + kDLFloat + kDLBfloat + kDLComplex + kDLBool + kDLFloat8_e3m4 + kDLFloat8_e4m3 + kDLFloat8_e4m3b11fnuz + kDLFloat8_e4m3fn + kDLFloat8_e4m3fnuz + kDLFloat8_e5m2 + kDLFloat8_e5m2fnuz + kDLFloat8_e8m0fnu + kDLFloat6_e2m3fn + kDLFloat6_e3m2fn + kDLFloat4_e2m1fn + + ctypedef struct DLDataType: + uint8_t code + uint8_t bits + uint16_t lanes + + ctypedef struct DLTensor: + void *data + DLDevice device + int ndim + DLDataType dtype + int64_t *shape + int64_t *strides + uint64_t byte_offset + + ctypedef struct DLManagedTensor: + DLTensor dl_tensor + void *manager_ctx + void (*deleter)(DLManagedTensor *) # noqa: E211 + + ctypedef struct DLManagedTensorVersioned: + DLPackVersion version + void *manager_ctx + void (*deleter)(DLManagedTensorVersioned *) # noqa: E211 + uint64_t flags + DLTensor dl_tensor + + +def get_build_dlpack_version(): + """ + Returns a tuple of integers representing the `major` and `minor` + version of DLPack :module:`dpctl.tensor` was built with. + This tuple can be passed as the `max_version` argument to + `__dlpack__` to guarantee module:`dpctl.tensor` can properly + consume capsule. + + Returns: + Tuple[int, int] + A tuple of integers representing the `major` and `minor` + version of DLPack used to build :module:`dpctl.tensor`. 
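+
+    Example:
+        An illustrative sketch of passing the returned tuple to
+        ``__dlpack__`` (assumes ``dpnp.tensor`` exposes this function and
+        an array constructor):
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.ones(4)
+        >>> caps = x.__dlpack__(max_version=dpt.get_build_dlpack_version())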
+    """
+    return (DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION)
+
+
+cdef void _pycapsule_deleter(object dlt_capsule) noexcept:
+    cdef DLManagedTensor *dlm_tensor = NULL
+    if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor"):
+        dlm_tensor = <DLManagedTensor *> cpython.PyCapsule_GetPointer(
+            dlt_capsule, "dltensor")
+        dlm_tensor.deleter(dlm_tensor)
+
+
+cdef void _managed_tensor_deleter(
+    DLManagedTensor *dlm_tensor
+) noexcept with gil:
+    if dlm_tensor is not NULL:
+        # we only delete shape, because we make single allocation to
+        # accommodate both shape and strides if strides are needed
+        stdlib.free(dlm_tensor.dl_tensor.shape)
+        cpython.Py_DECREF(<object> dlm_tensor.manager_ctx)
+        dlm_tensor.manager_ctx = NULL
+        stdlib.free(dlm_tensor)
+
+
+cdef void _pycapsule_versioned_deleter(object dlt_capsule) noexcept:
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor_versioned"):
+        dlmv_tensor = <DLManagedTensorVersioned *> cpython.PyCapsule_GetPointer(
+            dlt_capsule, "dltensor_versioned")
+        dlmv_tensor.deleter(dlmv_tensor)
+
+
+cdef void _managed_tensor_versioned_deleter(
+    DLManagedTensorVersioned *dlmv_tensor
+) noexcept with gil:
+    if dlmv_tensor is not NULL:
+        # we only delete shape, because we make single allocation to
+        # accommodate both shape and strides if strides are needed
+        stdlib.free(dlmv_tensor.dl_tensor.shape)
+        cpython.Py_DECREF(<object> dlmv_tensor.manager_ctx)
+        dlmv_tensor.manager_ctx = NULL
+        stdlib.free(dlmv_tensor)
+
+
+cdef object _get_default_context(c_dpctl.SyclDevice dev):
+    try:
+        default_context = dev.sycl_platform.default_context
+    except RuntimeError:
+        # RT does not support default_context
+        default_context = None
+
+    return default_context
+
+cdef int get_array_dlpack_device_id(
+    usm_ndarray usm_ary
+) except -1:
+    """Finds ordinal number of the parent of the device where the array
+    was allocated.
+    """
+    cdef c_dpctl.SyclQueue ary_sycl_queue
+    cdef c_dpctl.SyclDevice ary_sycl_device
+    cdef DPCTLSyclDeviceRef pDRef = NULL
+    cdef int device_id = -1
+
+    ary_sycl_queue = usm_ary.get_sycl_queue()
+    ary_sycl_device = ary_sycl_queue.get_sycl_device()
+
+    default_context = _get_default_context(ary_sycl_device)
+    if default_context is None:
+        # check that ary_sycl_device is a non-partitioned device
+        pDRef = DPCTLDevice_GetParentDevice(ary_sycl_device.get_device_ref())
+        if pDRef is not NULL:
+            DPCTLDevice_Delete(pDRef)
+            raise DLPackCreationError(
+                "to_dlpack_capsule: DLPack can only export arrays allocated "
+                "on non-partitioned SYCL devices on platforms where "
+                "default_context oneAPI extension is not supported."
+            )
+    else:
+        if not usm_ary.sycl_context == default_context:
+            raise DLPackCreationError(
+                "to_dlpack_capsule: DLPack can only export arrays based on USM "
+                "allocations bound to a default platform SYCL context"
+            )
+    device_id = ary_sycl_device.get_device_id()
+
+    if device_id < 0:
+        raise DLPackCreationError(
+            "get_array_dlpack_device_id: failed to determine device_id"
+        )
+
+    return device_id
+
+
+cpdef to_dlpack_capsule(usm_ndarray usm_ary):
+    """
+    to_dlpack_capsule(usm_ary)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensor`` from
+    :class:`dpctl.tensor.usm_ndarray` instance.
+
+    Args:
+        usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
+    Returns:
+        A new capsule with name ``"dltensor"`` that contains
+        a pointer to ``DLManagedTensor`` struct.
+    Raises:
+        DLPackCreationError: when the array can not be represented as a
+            DLPack tensor. This may happen when the array was allocated
+            on a partitioned sycl device, or its USM allocation is
+            not bound to the platform default SYCL context.
+        MemoryError: when the host allocation needed for ``DLManagedTensor``
+            did not succeed.
+        ValueError: when the array elements data type could not be
+            represented in ``DLManagedTensor``.
+    """
+    cdef DLManagedTensor *dlm_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef int nd = usm_ary.get_ndim()
+    cdef char *data_ptr = usm_ary.get_data()
+    cdef Py_ssize_t *shape_ptr = NULL
+    cdef Py_ssize_t *strides_ptr = NULL
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef int device_id = -1
+    cdef int flags = 0
+    cdef Py_ssize_t element_offset = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef Py_ssize_t si = 1
+
+    ary_base = usm_ary.get_base()
+
+    device_id = get_array_dlpack_device_id(usm_ary)
+
+    dlm_tensor = <DLManagedTensor *> stdlib.malloc(
+        sizeof(DLManagedTensor))
+    if dlm_tensor is NULL:
+        raise MemoryError(
+            "to_dlpack_capsule: Could not allocate memory for DLManagedTensor"
+        )
+    if nd > 0:
+        shape_strides_ptr = <int64_t *> stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlm_tensor)
+            raise MemoryError(
+                "to_dlpack_capsule: Could not allocate memory for shape/strides"
+            )
+        shape_ptr = usm_ary.get_shape()
+        for i in range(nd):
+            shape_strides_ptr[i] = shape_ptr[i]
+        strides_ptr = usm_ary.get_strides()
+        flags = usm_ary.flags_
+        if strides_ptr:
+            for i in range(nd):
+                shape_strides_ptr[nd + i] = strides_ptr[i]
+        else:
+            if flags & USM_ARRAY_C_CONTIGUOUS:
+                si = 1
+                for i in range(nd - 1, -1, -1):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            elif flags & USM_ARRAY_F_CONTIGUOUS:
+                si = 1
+                for i in range(0, nd):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            else:
+                stdlib.free(shape_strides_ptr)
+                stdlib.free(dlm_tensor)
+                raise BufferError(
+                    "to_dlpack_capsule: Invalid array encountered "
+                    "when building strides"
+                )
+
+        strides_ptr = &shape_strides_ptr[nd]
+
+    ary_dt = usm_ary.dtype
+    ary_dtk = ary_dt.kind
+    element_offset = usm_ary.get_offset()
+    byte_offset = element_offset * (ary_dt.itemsize)
+
+    dl_tensor = &dlm_tensor.dl_tensor
+    dl_tensor.data = <void *>(data_ptr - byte_offset)
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLOneAPI
+    dl_tensor.device.device_id = device_id
+    dl_tensor.dtype.lanes = 1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = kDLInt
+    elif (ary_dtk == "f"):
+        dl_tensor.dtype.code = kDLFloat
+    elif (ary_dtk == "c"):
+        dl_tensor.dtype.code = kDLComplex
+    else:
+        stdlib.free(shape_strides_ptr)
+        stdlib.free(dlm_tensor)
+        raise ValueError("Unrecognized array data type")
+
+    dlm_tensor.manager_ctx = <void *>ary_base
+    cpython.Py_INCREF(ary_base)
+    dlm_tensor.deleter = _managed_tensor_deleter
+
+    return cpython.PyCapsule_New(dlm_tensor, "dltensor", _pycapsule_deleter)
+
+
+cpdef to_dlpack_versioned_capsule(usm_ndarray usm_ary, bint copied):
+    """
+    to_dlpack_versioned_capsule(usm_ary, copied)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensorVersioned`` from
+    :class:`dpctl.tensor.usm_ndarray` instance.
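+
+    A minimal call-pattern sketch (illustrative; assumes an array created
+    via this module's constructors):
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.ones(5)
+        >>> caps = to_dlpack_versioned_capsule(x, False)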
+
+    Args:
+        usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
+        copied: A bint representing whether the data was previously
+            copied in order to set the flags with the is-copied
+            bitmask.
+    Returns:
+        A new capsule with name ``"dltensor_versioned"`` that
+        contains a pointer to ``DLManagedTensorVersioned`` struct.
+    Raises:
+        DLPackCreationError: when the array can not be represented as a
+            DLPack tensor. This may happen when the array was allocated
+            on a partitioned sycl device, or its USM allocation is
+            not bound to the platform default SYCL context.
+        MemoryError: when the host allocation needed for
+            ``DLManagedTensorVersioned`` did not succeed.
+        ValueError: when the array elements data type could not be
+            represented in ``DLManagedTensorVersioned``.
+    """
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef uint32_t dlmv_flags = 0
+    cdef int nd = usm_ary.get_ndim()
+    cdef char *data_ptr = usm_ary.get_data()
+    cdef Py_ssize_t *shape_ptr = NULL
+    cdef Py_ssize_t *strides_ptr = NULL
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef int device_id = -1
+    cdef int flags = 0
+    cdef Py_ssize_t element_offset = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef Py_ssize_t si = 1
+
+    ary_base = usm_ary.get_base()
+
+    # Find ordinal number of the parent device
+    device_id = get_array_dlpack_device_id(usm_ary)
+
+    dlmv_tensor = <DLManagedTensorVersioned *> stdlib.malloc(
+        sizeof(DLManagedTensorVersioned))
+    if dlmv_tensor is NULL:
+        raise MemoryError(
+            "to_dlpack_versioned_capsule: Could not allocate memory "
+            "for DLManagedTensorVersioned"
+        )
+    if nd > 0:
+        shape_strides_ptr = <int64_t *> stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlmv_tensor)
+            raise MemoryError(
+                "to_dlpack_versioned_capsule: Could not allocate memory "
+                "for shape/strides"
+            )
+        # this can be a separate function for handling shapes and strides
+        shape_ptr = usm_ary.get_shape()
+        for i in range(nd):
+            shape_strides_ptr[i] = shape_ptr[i]
+        strides_ptr = usm_ary.get_strides()
+        flags = usm_ary.flags_
+        if strides_ptr:
+            for i in range(nd):
+                shape_strides_ptr[nd + i] = strides_ptr[i]
+        else:
+            if flags & USM_ARRAY_C_CONTIGUOUS:
+                si = 1
+                for i in range(nd - 1, -1, -1):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            elif flags & USM_ARRAY_F_CONTIGUOUS:
+                si = 1
+                for i in range(0, nd):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            else:
+                stdlib.free(shape_strides_ptr)
+                stdlib.free(dlmv_tensor)
+                raise BufferError(
+                    "to_dlpack_versioned_capsule: Invalid array encountered "
+                    "when building strides"
+                )
+
+        strides_ptr = &shape_strides_ptr[nd]
+
+    # this can all be a function for building the dl_tensor
+    # object (separate from dlm/dlmv)
+    ary_dt = usm_ary.dtype
+    ary_dtk = ary_dt.kind
+    element_offset = usm_ary.get_offset()
+    byte_offset = element_offset * (ary_dt.itemsize)
+
+    dl_tensor = &dlmv_tensor.dl_tensor
+    dl_tensor.data = <void *>(data_ptr - byte_offset)
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLOneAPI
+    dl_tensor.device.device_id = device_id
+    dl_tensor.dtype.lanes = 1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = kDLInt
+    elif (ary_dtk == "f"):
+        dl_tensor.dtype.code = kDLFloat
+    elif (ary_dtk == "c"):
+        dl_tensor.dtype.code = kDLComplex
+    else:
+        stdlib.free(shape_strides_ptr)
+        stdlib.free(dlmv_tensor)
+        raise ValueError("Unrecognized array data type")
+
+    # set flags down here
+    if copied:
+        dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED
+    if not (flags & USM_ARRAY_WRITABLE):
+        dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY
+    dlmv_tensor.flags = dlmv_flags
+
+    dlmv_tensor.version.major = DLPACK_MAJOR_VERSION
+    dlmv_tensor.version.minor = DLPACK_MINOR_VERSION
+
+    dlmv_tensor.manager_ctx = <void *>ary_base
+    cpython.Py_INCREF(ary_base)
+    dlmv_tensor.deleter = _managed_tensor_versioned_deleter
+
+    return cpython.PyCapsule_New(
+        dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter
+    )
+
+
+cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied):
+    """
+    numpy_to_dlpack_versioned_capsule(npy_ary, copied)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensorVersioned`` from
+    :class:`numpy.ndarray` instance.
+
+    Args:
+        npy_ary: An instance of :class:`numpy.ndarray`
+        copied: A bint representing whether the data was previously
+            copied in order to set the flags with the is-copied
+            bitmask.
+    Returns:
+        A new capsule with name ``"dltensor_versioned"`` that
+        contains a pointer to ``DLManagedTensorVersioned`` struct.
+    Raises:
+        DLPackCreationError: when the array can not be represented as a
+            DLPack tensor.
+        MemoryError: when the host allocation needed for
+            ``DLManagedTensorVersioned`` did not succeed.
+        ValueError: when the array elements data type could not be
+            represented in ``DLManagedTensorVersioned``.
+    """
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef uint32_t dlmv_flags = 0
+    cdef int nd = npy_ary.ndim
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef int itemsize = npy_ary.itemsize
+
+    dlmv_tensor = <DLManagedTensorVersioned *> stdlib.malloc(
+        sizeof(DLManagedTensorVersioned))
+    if dlmv_tensor is NULL:
+        raise MemoryError(
+            "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
+            "for DLManagedTensorVersioned"
+        )
+
+    shape = npy_ary.ctypes.shape_as(ctypes.c_int64)
+    strides = npy_ary.ctypes.strides_as(ctypes.c_int64)
+    if nd > 0:
+        if npy_ary.size != 1:
+            for i in range(nd):
+                if shape[i] != 1 and strides[i] % itemsize != 0:
+                    stdlib.free(dlmv_tensor)
+                    raise BufferError(
+                        "numpy_to_dlpack_versioned_capsule: DLPack cannot "
+                        "encode an array if strides are not a multiple of "
+                        "itemsize"
+                    )
+        shape_strides_ptr = <int64_t *> stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlmv_tensor)
+            raise MemoryError(
+                "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
+                "for shape/strides"
+            )
+        for i in range(nd):
+            shape_strides_ptr[i] = shape[i]
+            shape_strides_ptr[nd + i] = strides[i] // itemsize
+
+    writable_flag = npy_ary.flags["W"]
+
+    ary_dt = npy_ary.dtype
+    ary_dtk = ary_dt.kind
+
+    dl_tensor = &dlmv_tensor.dl_tensor
+    dl_tensor.data = npy_ary.data
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLCPU
+    dl_tensor.device.device_id = 0
+    dl_tensor.dtype.lanes = 1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = kDLInt
+    elif (ary_dtk == "f" and ary_dt.itemsize <= 8):
+        dl_tensor.dtype.code = kDLFloat
+    elif
(ary_dtk == "c" and ary_dt.itemsize <= 16): + dl_tensor.dtype.code = kDLComplex + else: + stdlib.free(shape_strides_ptr) + stdlib.free(dlmv_tensor) + raise ValueError("Unrecognized array data type") + + # set flags down here + if copied: + dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED + if not writable_flag: + dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY + dlmv_tensor.flags = dlmv_flags + + dlmv_tensor.version.major = DLPACK_MAJOR_VERSION + dlmv_tensor.version.minor = DLPACK_MINOR_VERSION + + dlmv_tensor.manager_ctx = npy_ary + cpython.Py_INCREF(npy_ary) + dlmv_tensor.deleter = _managed_tensor_versioned_deleter + + return cpython.PyCapsule_New( + dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter + ) + + +cdef class _DLManagedTensorOwner: + """ + Helper class managing the lifetime of the DLManagedTensor struct + transferred from a 'dlpack' capsule. + """ + cdef DLManagedTensor * dlm_tensor + + def __cinit__(self): + self.dlm_tensor = NULL + + def __dealloc__(self): + if self.dlm_tensor: + self.dlm_tensor.deleter(self.dlm_tensor) + self.dlm_tensor = NULL + + @staticmethod + cdef _DLManagedTensorOwner _create(DLManagedTensor *dlm_tensor_src): + cdef _DLManagedTensorOwner res + res = _DLManagedTensorOwner.__new__(_DLManagedTensorOwner) + res.dlm_tensor = dlm_tensor_src + return res + + +cdef class _DLManagedTensorVersionedOwner: + """ + Helper class managing the lifetime of the DLManagedTensorVersioned + struct transferred from a 'dlpack_versioned' capsule. + """ + cdef DLManagedTensorVersioned * dlmv_tensor + + def __cinit__(self): + self.dlmv_tensor = NULL + + def __dealloc__(self): + if self.dlmv_tensor: + self.dlmv_tensor.deleter(self.dlmv_tensor) + self.dlmv_tensor = NULL + + @staticmethod + cdef _DLManagedTensorVersionedOwner _create( + DLManagedTensorVersioned *dlmv_tensor_src + ): + cdef _DLManagedTensorVersionedOwner res + res = _DLManagedTensorVersionedOwner.__new__( + _DLManagedTensorVersionedOwner + ) + res.dlmv_tensor = dlmv_tensor_src + return res + + +cdef dict _numpy_array_interface_from_dl_tensor(DLTensor *dlt, bint ro_flag): + """Constructs a NumPy `__array_interface__` dictionary from a DLTensor.""" + cdef int itemsize = 0 + + if dlt.dtype.lanes != 1: + raise BufferError( + "Can not import DLPack tensor with lanes != 1" + ) + itemsize = dlt.dtype.bits // 8 + shape = list() + if (dlt.strides is NULL): + strides = None + for dim in range(dlt.ndim): + shape.append(dlt.shape[dim]) + else: + strides = list() + for dim in range(dlt.ndim): + shape.append(dlt.shape[dim]) + # convert to byte-strides + strides.append(dlt.strides[dim] * itemsize) + strides = tuple(strides) + shape = tuple(shape) + if (dlt.dtype.code == kDLUInt): + ary_dt = "u" + str(itemsize) + elif (dlt.dtype.code == kDLInt): + ary_dt = "i" + str(itemsize) + elif (dlt.dtype.code == kDLFloat): + ary_dt = "f" + str(itemsize) + elif (dlt.dtype.code == kDLComplex): + ary_dt = "c" + str(itemsize) + elif (dlt.dtype.code == kDLBool): + ary_dt = "b" + str(itemsize) + else: + raise BufferError( + "Can not import DLPack tensor with type code {}.".format( + dlt.dtype.code + ) + ) + typestr = "|" + ary_dt + return dict( + version=3, + shape=shape, + strides=strides, + data=( dlt.data, True if ro_flag else False), + offset=dlt.byte_offset, + typestr=typestr, + ) + + +class _numpy_array_interface_wrapper: + """ + Class that wraps a Python capsule and dictionary for consumption by NumPy. 
+ + Implementation taken from + https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/to_numpy.py + + Args: + array_interface: + A dictionary describing the underlying memory. Formatted + to match `numpy.ndarray.__array_interface__`. + + pycapsule: + A Python capsule wrapping the dlpack tensor that will be + converted to numpy. + """ + + def __init__(self, array_interface, memory_owner) -> None: + self.__array_interface__ = array_interface + self._memory_owner = memory_owner + + +cdef bint _is_kdlcpu_device(DLDevice *dev): + "Check if DLTensor.DLDevice denotes (kDLCPU, 0)" + return (dev[0].device_type == kDLCPU) and (dev[0].device_id == 0) + + +cpdef object from_dlpack_capsule(object py_caps): + """ + from_dlpack_capsule(py_caps) + + Reconstructs instance of :class:`dpctl.tensor.usm_ndarray` from + named Python capsule object referencing instance of ``DLManagedTensor`` + without copy. The instance forms a view in the memory of the tensor. + + Args: + caps: + Python capsule with name ``"dltensor"`` expected to reference + an instance of ``DLManagedTensor`` struct. + Returns: + Instance of :class:`dpctl.tensor.usm_ndarray` with a view into + memory of the tensor. Capsule is renamed to ``"used_dltensor"`` + upon success. + Raises: + TypeError: + if argument is not a ``"dltensor"`` capsule. + ValueError: + if argument is ``"used_dltensor"`` capsule + BufferError: + if the USM pointer is not bound to the reconstructed + sycl context, or the DLPack's device_type is not supported + by :mod:`dpctl`. + """ + cdef DLManagedTensorVersioned *dlmv_tensor = NULL + cdef DLManagedTensor *dlm_tensor = NULL + cdef DLTensor *dl_tensor = NULL + cdef int versioned = 0 + cdef int readonly = 0 + cdef bytes usm_type + cdef size_t sz = 1 + cdef size_t alloc_sz = 1 + cdef int i + cdef int device_id = -1 + cdef int element_bytesize = 0 + cdef Py_ssize_t offset_min = 0 + cdef Py_ssize_t offset_max = 0 + cdef char *mem_ptr = NULL + cdef Py_ssize_t mem_ptr_delta = 0 + cdef Py_ssize_t element_offset = 0 + cdef int64_t stride_i = -1 + cdef int64_t shape_i = -1 + + if cpython.PyCapsule_IsValid(py_caps, "dltensor"): + dlm_tensor = cpython.PyCapsule_GetPointer( + py_caps, "dltensor") + dl_tensor = &dlm_tensor.dl_tensor + elif cpython.PyCapsule_IsValid(py_caps, "dltensor_versioned"): + dlmv_tensor = cpython.PyCapsule_GetPointer( + py_caps, "dltensor_versioned") + if dlmv_tensor.version.major > DLPACK_MAJOR_VERSION: + raise BufferError( + "Can not import DLPack tensor with major version " + f"greater than {DLPACK_MAJOR_VERSION}" + ) + versioned = 1 + readonly = (dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0 + dl_tensor = &dlmv_tensor.dl_tensor + elif ( + cpython.PyCapsule_IsValid(py_caps, "used_dltensor") + or cpython.PyCapsule_IsValid(py_caps, "used_dltensor_versioned") + ): + raise ValueError( + "A DLPack tensor object can not be consumed multiple times" + ) + else: + raise TypeError( + "`from_dlpack_capsule` expects a Python 'dltensor' capsule" + ) + + # Verify that we can work with this device + if dl_tensor.device.device_type == kDLOneAPI: + device_id = dl_tensor.device.device_id + root_device = dpctl.SyclDevice(str(device_id)) + try: + default_context = root_device.sycl_platform.default_context + except RuntimeError: + default_context = get_device_cached_queue(root_device).sycl_context + if dl_tensor.data is NULL: + usm_type = b"device" + q = get_device_cached_queue((default_context, root_device,)) + else: + usm_type = c_dpmem._Memory.get_pointer_type( + dl_tensor.data, + default_context) + if 
usm_type == b"unknown": + raise BufferError( + "Data pointer in DLPack is not bound to default sycl " + f"context of device '{device_id}', translated to " + f"{root_device.filter_string}" + ) + alloc_device = c_dpmem._Memory.get_pointer_device( + dl_tensor.data, + default_context + ) + q = get_device_cached_queue((default_context, alloc_device,)) + if dl_tensor.dtype.bits % 8: + raise BufferError( + "Can not import DLPack tensor whose element's " + "bitsize is not a multiple of 8" + ) + if dl_tensor.dtype.lanes != 1: + raise BufferError( + "Can not import DLPack tensor with lanes != 1" + ) + if dl_tensor.ndim > 0: + offset_min = 0 + offset_max = 0 + for i in range(dl_tensor.ndim): + stride_i = dl_tensor.strides[i] + shape_i = dl_tensor.shape[i] + if shape_i > 1: + shape_i -= 1 + if stride_i > 0: + offset_max = offset_max + stride_i * shape_i + else: + offset_min = offset_min + stride_i * shape_i + sz = offset_max - offset_min + 1 + if sz == 0: + sz = 1 + + element_bytesize = (dl_tensor.dtype.bits // 8) + sz = sz * element_bytesize + element_offset = dl_tensor.byte_offset // element_bytesize + + # transfer ownership + if not versioned: + dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) + cpython.PyCapsule_SetName(py_caps, "used_dltensor") + else: + dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor) + cpython.PyCapsule_SetName(py_caps, "used_dltensor_versioned") + + if dl_tensor.data is NULL: + usm_mem = dpmem.MemoryUSMDevice(sz, q) + else: + mem_ptr_delta = dl_tensor.byte_offset - ( + element_offset * element_bytesize + ) + mem_ptr = dl_tensor.data + alloc_sz = dl_tensor.byte_offset + ( + (offset_max + 1) * element_bytesize) + tmp = c_dpmem._Memory.create_from_usm_pointer_size_qref( + mem_ptr, + max(alloc_sz, element_bytesize), + (q).get_queue_ref(), + memory_owner=dlmv_holder if versioned else dlm_holder + ) + if mem_ptr_delta == 0: + usm_mem = tmp + else: + alloc_sz = dl_tensor.byte_offset + ( + (offset_max * element_bytesize + mem_ptr_delta)) + usm_mem = c_dpmem._Memory.create_from_usm_pointer_size_qref( + ( + mem_ptr + (element_bytesize - mem_ptr_delta) + ), + max(alloc_sz, element_bytesize), + (q).get_queue_ref(), + memory_owner=tmp + ) + + py_shape = list() + if (dl_tensor.shape is not NULL): + for i in range(dl_tensor.ndim): + py_shape.append(dl_tensor.shape[i]) + if (dl_tensor.strides is not NULL): + py_strides = list() + for i in range(dl_tensor.ndim): + py_strides.append(dl_tensor.strides[i]) + else: + py_strides = None + if (dl_tensor.dtype.code == kDLUInt): + ary_dt = np.dtype("u" + str(element_bytesize)) + elif (dl_tensor.dtype.code == kDLInt): + ary_dt = np.dtype("i" + str(element_bytesize)) + elif (dl_tensor.dtype.code == kDLFloat): + ary_dt = np.dtype("f" + str(element_bytesize)) + elif (dl_tensor.dtype.code == kDLComplex): + ary_dt = np.dtype("c" + str(element_bytesize)) + elif (dl_tensor.dtype.code == kDLBool): + ary_dt = np.dtype("?") + else: + raise BufferError( + "Can not import DLPack tensor with type code {}.".format( + dl_tensor.dtype.code + ) + ) + res_ary = usm_ndarray( + py_shape, + dtype=ary_dt, + buffer=usm_mem, + strides=py_strides, + offset=element_offset + ) + if readonly: + res_ary.flags_ = (res_ary.flags_ & ~USM_ARRAY_WRITABLE) + return res_ary + elif _is_kdlcpu_device(&dl_tensor.device): + ary_iface = _numpy_array_interface_from_dl_tensor(dl_tensor, readonly) + if not versioned: + dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) + cpython.PyCapsule_SetName(py_caps, "used_dltensor") + return np.ctypeslib.as_array( + 
+
+
+cdef usm_ndarray _to_usm_ary_from_host_blob(object host_blob, dev: Device):
+    q = dev.sycl_queue
+    np_ary = np.asarray(host_blob)
+    dt = np_ary.dtype
+    if dt.char in "dD" and q.sycl_device.has_aspect_fp64 is False:
+        Xusm_dtype = (
+            "float32" if dt.char == "d" else "complex64"
+        )
+    else:
+        Xusm_dtype = dt
+    usm_mem = dpmem.MemoryUSMDevice(np_ary.nbytes, queue=q)
+    usm_ary = usm_ndarray(np_ary.shape, dtype=Xusm_dtype, buffer=usm_mem)
+    usm_mem.copy_from_host(np.reshape(np_ary.view(dtype="u1"), -1))
+    return usm_ary
+
+
+# only cdef to make it private
+cdef object _create_device(object device, object dl_device):
+    if isinstance(device, Device):
+        return device
+    elif isinstance(device, dpctl.SyclDevice):
+        return Device.create_device(device)
+    else:
+        root_device = dpctl.SyclDevice(str(dl_device[1]))
+        return Device.create_device(root_device)
+
+
+def from_dlpack(x, /, *, device=None, copy=None):
+    """from_dlpack(x, /, *, device=None, copy=None)
+
+    Constructs :class:`dpctl.tensor.usm_ndarray` or :class:`numpy.ndarray`
+    instance from a Python object ``x`` that implements the ``__dlpack__``
+    protocol.
+
+    Args:
+        x (object):
+            A Python object representing an array that supports
+            ``__dlpack__`` protocol.
+        device (
+            Optional[str, :class:`dpctl.SyclDevice`,
+            :class:`dpctl.SyclQueue`,
+            :class:`dpctl.tensor.Device`,
+            tuple([:class:`enum.IntEnum`, int])])
+        ):
+            Device where the output array is to be placed. ``device`` keyword
+            values can be:
+
+            * ``None``
+                The data remains on the same device.
+            * oneAPI filter selector string
+                SYCL device selected by :ref:`filter selector string
+                <filter_selector_string>`.
+            * :class:`dpctl.SyclDevice`
+                explicit SYCL device that must correspond to
+                a non-partitioned SYCL device.
+            * :class:`dpctl.SyclQueue`
+                implies SYCL device targeted by the SYCL queue.
+            * :class:`dpctl.tensor.Device`
+                implies SYCL device ``device.sycl_queue``. The `Device`
+                object is obtained via
+                :attr:`dpctl.tensor.usm_ndarray.device`.
+            * ``(device_type, device_id)``
+                2-tuple matching the format of the output of the
+                ``__dlpack_device__`` method: an integer enumerator
+                representing the device type followed by an integer
+                representing the index of the device. The only supported
+                :class:`dpctl.tensor.DLDeviceType` device types are
+                ``"kDLCPU"`` and ``"kDLOneAPI"``.
+
+            Default: ``None``.
+
+        copy (bool, optional):
+            Boolean indicating whether or not to copy the input.
+
+            * If ``copy`` is ``True``, the input will always be
+              copied.
+            * If ``False``, a ``BufferError`` will be raised if a
+              copy is deemed necessary.
+            * If ``None``, a copy will be made only if deemed
+              necessary, otherwise, the existing memory buffer will
+              be reused.
+
+            Default: ``None``.
+
+    Returns:
+        Alternative[usm_ndarray, numpy.ndarray]:
+            An array containing the data in ``x``. When ``copy`` is
+            ``None`` or ``False``, this may be a view into the original
+            memory.
+
+            The type of the returned object
+            depends on where the data backing up input object ``x`` resides.
+ If it resides in a USM allocation on a SYCL device, the + type :class:`dpctl.tensor.usm_ndarray` is returned, otherwise if it + resides on ``"kDLCPU"`` device the type is :class:`numpy.ndarray`, + and otherwise an exception is raised. + + .. note:: + + If the return type is :class:`dpctl.tensor.usm_ndarray`, the + associated SYCL queue is derived from the ``device`` keyword. + When ``device`` keyword value has type :class:`dpctl.SyclQueue`, + the explicit queue instance is used, when ``device`` keyword + value has type :class:`dpctl.tensor.Device`, the + ``device.sycl_queue`` is used. In all other cases, the cached + SYCL queue corresponding to the implied SYCL device is used. + + Raises: + TypeError: + if ``x`` does not implement ``__dlpack__`` method + ValueError: + if data of the input object resides on an unsupported device + + See https://dmlc.github.io/dlpack/latest/ for more details. + + :Example: + + .. code-block:: python + + import dpctl + import dpnp.tensor as dpt + + class Container: + "Helper class implementing `__dlpack__` protocol" + def __init__(self, array): + self._array = array + + def __dlpack__(self, stream=None): + return self._array.__dlpack__(stream=stream) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + C = Container(dpt.linspace(0, 100, num=20, dtype="int16")) + # create usm_ndarray view + X = dpt.from_dlpack(C) + # migrate content of the container to device of type kDLCPU + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + + """ + dlpack_attr = getattr(x, "__dlpack__", None) + dlpack_dev_attr = getattr(x, "__dlpack_device__", None) + if not callable(dlpack_attr) or not callable(dlpack_dev_attr): + raise TypeError( + f"The argument of type {type(x)} does not implement " + "`__dlpack__` and `__dlpack_device__` methods." + ) + # device is converted to a dlpack_device if necessary + dl_device = None + if device: + if isinstance(device, tuple): + dl_device = device + if len(dl_device) != 2: + raise ValueError( + "Argument `device` specified as a tuple must have length 2" + ) + else: + if not isinstance(device, dpctl.SyclDevice): + device = Device.create_device(device) + d = device.sycl_device + else: + d = device + dl_device = (device_OneAPI, d.get_device_id()) + if dl_device is not None: + if (dl_device[0] not in [device_OneAPI, device_CPU]): + raise ValueError( + f"Argument `device`={device} is not supported." 
+ ) + got_type_error = False + got_buffer_error = False + got_other_error = False + saved_exception = None + # First DLPack version supporting dl_device, and copy + requested_ver = (1, 0) + cpu_dev = (device_CPU, 0) + try: + # setting max_version to minimal version that supports + # dl_device/copy keywords + dlpack_capsule = dlpack_attr( + max_version=requested_ver, + dl_device=dl_device, + copy=copy + ) + except TypeError: + # exporter does not support max_version keyword + got_type_error = True + except (BufferError, NotImplementedError, ValueError) as e: + # Either dl_device, or copy cannot be satisfied + got_buffer_error = True + saved_exception = e + except Exception as e: + got_other_error = True + saved_exception = e + else: + # execution did not raise exceptions + return from_dlpack_capsule(dlpack_capsule) + finally: + if got_type_error: + # max_version/dl_device, copy keywords are not supported + # by __dlpack__ + x_dldev = dlpack_dev_attr() + if (dl_device is None) or (dl_device == x_dldev): + dlpack_capsule = dlpack_attr() + return from_dlpack_capsule(dlpack_capsule) + # must copy via host + if copy is False: + raise BufferError( + "Importing data via DLPack requires copying, but " + "copy=False was provided" + ) + # when max_version/dl_device/copy are not supported + # we can only support importing to OneAPI devices + # from host, or from another oneAPI device + is_supported_x_dldev = ( + x_dldev == cpu_dev or + (x_dldev[0] == device_OneAPI) + ) + is_supported_dl_device = ( + dl_device == cpu_dev or + dl_device[0] == device_OneAPI + ) + if is_supported_x_dldev and is_supported_dl_device: + dlpack_capsule = dlpack_attr() + blob = from_dlpack_capsule(dlpack_capsule) + else: + raise BufferError( + f"Can not import to requested device {dl_device}" + ) + dev = _create_device(device, dl_device) + if x_dldev == cpu_dev and dl_device == cpu_dev: + # both source and destination are CPU + return blob + elif x_dldev == cpu_dev: + # source is CPU, destination is oneAPI + return _to_usm_ary_from_host_blob(blob, dev) + elif dl_device == cpu_dev: + # source is oneAPI, destination is CPU + cpu_caps = blob.__dlpack__( + max_version=get_build_dlpack_version(), + dl_device=cpu_dev + ) + return from_dlpack_capsule(cpu_caps) + else: + import dpnp.tensor as dpt + return dpt.asarray(blob, device=dev) + elif got_buffer_error: + # we are here, because dlpack_attr could not deal with requested + # dl_device, or copying was required + if copy is False: + raise BufferError( + "Importing data via DLPack requires copying, but " + "copy=False was provided" + ) + if dl_device is None: + raise saved_exception + # must copy via host + if dl_device[0] != device_OneAPI: + raise BufferError( + f"Can not import to requested device {dl_device}" + ) + x_dldev = dlpack_dev_attr() + if x_dldev == cpu_dev: + dlpack_capsule = dlpack_attr() + host_blob = from_dlpack_capsule(dlpack_capsule) + else: + dlpack_capsule = dlpack_attr( + max_version=requested_ver, + dl_device=cpu_dev, + copy=copy + ) + host_blob = from_dlpack_capsule(dlpack_capsule) + dev = _create_device(device, dl_device) + return _to_usm_ary_from_host_blob(host_blob, dev) + elif got_other_error: + raise saved_exception diff --git a/dpnp/tensor/_elementwise_common.py b/dpnp/tensor/_elementwise_common.py new file mode 100644 index 000000000000..2eb89b8fb5f8 --- /dev/null +++ b/dpnp/tensor/_elementwise_common.py @@ -0,0 +1,988 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All 
rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+
+from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
+from ._manipulation_functions import _broadcast_shape_impl
+from ._scalar_utils import (
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _validate_dtype,
+)
+from ._type_utils import (
+    _acceptance_fn_default_binary,
+    _acceptance_fn_default_unary,
+    _all_data_types,
+    _find_buf_dtype,
+    _find_buf_dtype2,
+    _find_buf_dtype_in_place_op,
+    _resolve_weak_types,
+)
+
+
+class UnaryElementwiseFunc:
+    """
+    Class that implements unary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the unary function
+        result_type_resolver_fn (callable):
+            Function that takes the dtype of the input and
+            returns the dtype of the result if the
+            implementation function supports it, or
+            returns `None` otherwise.
+        unary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values, effectively
+            evaluating `dst = func(src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        acceptance_fn (callable, optional):
+            Function to influence type promotion behavior of this unary
+            function. The function takes 4 arguments:
+                arg_dtype - Data type of the first argument
+                buf_dtype - Data type the argument would be cast to
+                res_dtype - Data type of the output array with function values
+                sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                    evaluation is carried out.
+            The function is invoked when the argument of the unary function
+            requires casting, e.g. the argument of `dpctl.tensor.log` is an
+            array with integral data type.
+        docs (str):
+            Documentation string for the unary function.
+    """
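For orientation, a callable satisfying the `unary_dp_impl_fn` contract just described has roughly this shape (a sketch; `_enqueue_kernel` is a hypothetical stand-in for a compiled kernel launcher such as those in `dpnp.tensor._tensor_elementwise_impl`):

```python
def _hypothetical_unary_impl(src, dst, sycl_queue, depends=None):
    # depends: events the offloaded kernel must wait on
    deps = [] if depends is None else depends
    # _enqueue_kernel is hypothetical: it submits the computation plus a
    # host task that keeps src/dst alive until the kernel completes
    ht_ev, comp_ev = _enqueue_kernel(src, dst, sycl_queue, deps)
    return ht_ev, comp_ev  # (host-task event, computation event)
```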
+
+    def __init__(
+        self,
+        name,
+        result_type_resolver_fn,
+        unary_dp_impl_fn,
+        docs,
+        acceptance_fn=None,
+    ):
+        self.__name__ = "UnaryElementwiseFunc"
+        self.name_ = name
+        self.result_type_resolver_fn_ = result_type_resolver_fn
+        self.types_ = None
+        self.unary_fn_ = unary_dp_impl_fn
+        self.__doc__ = docs
+        if callable(acceptance_fn):
+            self.acceptance_fn_ = acceptance_fn
+        else:
+            self.acceptance_fn_ = _acceptance_fn_default_unary
+
+    def __str__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def __repr__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def get_implementation_function(self):
+        """Returns the implementation function for
+        this elementwise unary function.
+
+        """
+        return self.unary_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise unary function.
+        """
+        return self.result_type_resolver_fn_
+
+    def get_type_promotion_path_acceptance_function(self):
+        """Returns the acceptance function for this
+        elementwise unary function.
+
+        Acceptance function influences the type promotion
+        behavior of this unary function.
+        The function takes 4 arguments:
+            arg_dtype - Data type of the first argument
+            buf_dtype - Data type the argument would be cast to
+            res_dtype - Data type of the output array with function values
+            sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                evaluation is carried out.
+        The function is invoked when the argument of the unary function
+        requires casting, e.g. the argument of `dpctl.tensor.log` is an
+        array with integral data type.
+        """
+        return self.acceptance_fn_
+
+    @property
+    def nin(self):
+        """Returns the number of arguments treated as inputs."""
+        return 1
+
+    @property
+    def nout(self):
+        """Returns the number of arguments treated as outputs."""
+        return 1
+
+    @property
+    def types(self):
+        """Returns information about types supported by
+        implementation function, using NumPy's character
+        encoding for data types, e.g.
+
+        :Example:
+            .. code-block:: python
+
+                dpctl.tensor.sin.types
+                # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D']
+        """
+        types = self.types_
+        if not types:
+            types = []
+            for dt1 in _all_data_types(True, True):
+                dt2 = self.result_type_resolver_fn_(dt1)
+                if dt2:
+                    types.append(f"{dt1.char}->{dt2.char}")
+            self.types_ = types
+        return types
+
+    def __call__(self, x, /, *, out=None, order="K"):
+        if not isinstance(x, dpt.usm_ndarray):
+            raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
+
+        if order not in ["C", "F", "K", "A"]:
+            order = "K"
+        buf_dt, res_dt = _find_buf_dtype(
+            x.dtype,
+            self.result_type_resolver_fn_,
+            x.sycl_device,
+            acceptance_fn=self.acceptance_fn_,
+        )
+        if res_dt is None:
+            raise ValueError(
+                f"function '{self.name_}' does not support input type "
+                f"({x.dtype}), "
+                "and the input could not be safely coerced to any "
+                "supported types according to the casting rule ''safe''."
+            )
+
+        orig_out = out
+        if out is not None:
+            if not isinstance(out, dpt.usm_ndarray):
+                raise TypeError(
+                    f"output array must be of usm_ndarray type, got {type(out)}"
+                )
+
+            if not out.flags.writable:
+                raise ValueError("provided `out` array is read-only")
+
+            if out.shape != x.shape:
+                raise ValueError(
+                    "The shape of input and output arrays are inconsistent. "
+                    f"Expected output shape is {x.shape}, got {out.shape}"
+                )
+
+            if res_dt != out.dtype:
+                raise ValueError(
+                    f"Output array of type {res_dt} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if (
+                buf_dt is None
+                and ti._array_overlap(x, out)
+                and not ti._same_logical_tensors(x, out)
+            ):
+                # Allocate a temporary buffer to avoid memory overlapping.
+                # Note if `buf_dt` is not None, a temporary copy of `x` will be
+                # created, so the array overlap check isn't needed.
+                out = dpt.empty_like(out)
+
+            if dpt.get_execution_queue((x.sycl_queue, out.sycl_queue)) is None:
+                raise dpt.ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+        exec_q = x.sycl_queue
+        _manager = SequentialOrderManager[exec_q]
+        if buf_dt is None:
+            if out is None:
+                if order == "K":
+                    out = _empty_like_orderK(x, res_dt)
+                else:
+                    if order == "A":
+                        order = "F" if x.flags.f_contiguous else "C"
+                    out = dpt.empty_like(x, dtype=res_dt, order=order)
+
+            dep_evs = _manager.submitted_events
+            ht_unary_ev, unary_ev = self.unary_fn_(
+                x, out, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_unary_ev, unary_ev)
+
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev]
+                )
+                _manager.add_event_pair(ht_copy_ev, cpy_ev)
+                out = orig_out
+
+            return out
+
+        if order == "K":
+            buf = _empty_like_orderK(x, buf_dt)
+        else:
+            if order == "A":
+                order = "F" if x.flags.f_contiguous else "C"
+            buf = dpt.empty_like(x, dtype=buf_dt, order=order)
+
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if out is None:
+            if order == "K":
+                out = _empty_like_orderK(buf, res_dt)
+            else:
+                out = dpt.empty_like(buf, dtype=res_dt, order=order)
+
+        ht, uf_ev = self.unary_fn_(
+            buf, out, sycl_queue=exec_q, depends=[copy_ev]
+        )
+        _manager.add_event_pair(ht, uf_ev)
+
+        return out
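With the class in place, registering and calling a unary function follows the pattern of the bindings added later in this patch (a sketch; the docstring argument is abbreviated and `x` stands for any `usm_ndarray`):

```python
import dpnp.tensor._tensor_elementwise_impl as ti
from dpnp.tensor._elementwise_common import UnaryElementwiseFunc

# ti._cos_result_type / ti._cos are the resolver and kernel used by the
# `cos` binding defined in _elementwise_funcs.py below
cos = UnaryElementwiseFunc("cos", ti._cos_result_type, ti._cos, "cos(x)")

y = cos(x)                 # allocates the result, order="K" by default
cos(x, out=y, order="C")   # writes into a preallocated output array
```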
+
+
+class BinaryElementwiseFunc:
+    """
+    Class that implements binary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the binary function
+        result_type_resolver_fn (callable):
+            Function that takes the dtypes of the inputs and
+            returns the dtype of the result if the
+            implementation function supports it, or
+            returns `None` otherwise.
+        binary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src1` and `src2` are the argument arrays, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(src1, src2)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        docs (str):
+            Documentation string for the binary function.
+        binary_inplace_fn (callable, optional):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(dst, src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including async lifetime management of Python arguments,
+            while the second event corresponds to computational tasks
+            associated with function evaluation.
+        acceptance_fn (callable, optional):
+            Function to influence type promotion behavior of this binary
+            function. The function takes 6 arguments:
+                arg1_dtype - Data type of the first argument
+                arg2_dtype - Data type of the second argument
+                ret_buf1_dtype - Data type the first argument would be cast to
+                ret_buf2_dtype - Data type the second argument would be cast to
+                res_dtype - Data type of the output array with function values
+                sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                    evaluation is carried out.
+            The function is only called when both arguments of the binary
+            function require casting, e.g. both arguments of
+            `dpctl.tensor.logaddexp` are arrays with integral data type.
+    """
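To make the 6-argument contract concrete, an acceptance function vetoing a promotion path could look like this (a hypothetical sketch; it is not one of the `_acceptance_fn_*` helpers imported above):

```python
# Hypothetical acceptance function following the documented contract;
# returning False rejects the proposed promotion path.
def _reject_small_float_buffers(
    arg1_dtype, arg2_dtype, ret_buf1_dtype, ret_buf2_dtype, res_dtype, sycl_dev
):
    # e.g. refuse paths that would buffer both inputs in sub-4-byte types
    return not (
        ret_buf1_dtype.itemsize < 4 and ret_buf2_dtype.itemsize < 4
    )
```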
+
+    def __init__(
+        self,
+        name,
+        result_type_resolver_fn,
+        binary_dp_impl_fn,
+        docs,
+        binary_inplace_fn=None,
+        acceptance_fn=None,
+        weak_type_resolver=None,
+    ):
+        self.__name__ = "BinaryElementwiseFunc"
+        self.name_ = name
+        self.result_type_resolver_fn_ = result_type_resolver_fn
+        self.types_ = None
+        self.binary_fn_ = binary_dp_impl_fn
+        self.binary_inplace_fn_ = binary_inplace_fn
+        self.__doc__ = docs
+        if callable(acceptance_fn):
+            self.acceptance_fn_ = acceptance_fn
+        else:
+            self.acceptance_fn_ = _acceptance_fn_default_binary
+        if callable(weak_type_resolver):
+            self.weak_type_resolver_ = weak_type_resolver
+        else:
+            self.weak_type_resolver_ = _resolve_weak_types
+
+    def __str__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def __repr__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def get_implementation_function(self):
+        """Returns the out-of-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_fn_
+
+    def get_implementation_inplace_function(self):
+        """Returns the in-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_inplace_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise binary function.
+        """
+        return self.result_type_resolver_fn_
+
+    def get_type_promotion_path_acceptance_function(self):
+        """Returns the acceptance function for this
+        elementwise binary function.
+
+        Acceptance function influences the type promotion
+        behavior of this binary function.
+ The function takes 6 arguments: + arg1_dtype - Data type of the first argument + arg2_dtype - Data type of the second argument + ret_buf1_dtype - Data type the first argument would be cast to + ret_buf2_dtype - Data type the second argument would be cast to + res_dtype - Data type of the output array with function values + sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation + is carried out. + + The acceptance function is only invoked if both input arrays must be + cast to intermediary data types, as would happen during call of + `dpctl.tensor.hypot` with both arrays being of integral data type. + """ + return self.acceptance_fn_ + + def get_array_dtype_scalar_type_resolver_function(self): + """Returns the function which determines how to treat + Python scalar types for this elementwise binary function. + + Resolver influences what type the scalar will be + treated as prior to type promotion behavior. + The function takes 3 arguments: + + Args: + o1_dtype (object, dtype): + A class representing a Python scalar type or a ``dtype`` + o2_dtype (object, dtype): + A class representing a Python scalar type or a ``dtype`` + sycl_dev (:class:`dpctl.SyclDevice`): + Device on which function evaluation is carried out. + + One of ``o1_dtype`` and ``o2_dtype`` must be a ``dtype`` instance. + """ + return self.weak_type_resolver_ + + @property + def nin(self): + """Returns the number of arguments treated as inputs.""" + return 2 + + @property + def nout(self): + """Returns the number of arguments treated as outputs.""" + return 1 + + @property + def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.divide.types + # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D', + # 'Ff->F', 'FF->F', 'Dd->D', 'DD->D'] + """ + types = self.types_ + if not types: + types = [] + _all_dtypes = _all_data_types(True, True) + for dt1 in _all_dtypes: + for dt2 in _all_dtypes: + dt3 = self.result_type_resolver_fn_(dt1, dt2) + if dt3: + types.append(f"{dt1.char}{dt2.char}->{dt3.char}") + self.types_ = types + return types + + def __call__(self, o1, o2, /, *, out=None, order="K"): + if order not in ["K", "C", "F", "A"]: + order = "K" + q1, o1_usm_type = _get_queue_usm_type(o1) + q2, o2_usm_type = _get_queue_usm_type(o2) + if q1 is None and q2 is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + "One of the arguments must represent USM allocation and " + "expose `__sycl_usm_array_interface__` property" + ) + if q1 is None: + exec_q = q2 + res_usm_type = o2_usm_type + elif q2 is None: + exec_q = q1 + res_usm_type = o1_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + o1_usm_type, + o2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + o1_shape = _get_shape(o1) + o2_shape = _get_shape(o2) + if not all( + isinstance(s, (tuple, list)) + for s in ( + o1_shape, + o2_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + o1_shape, + o2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{o1_shape} and {o2_shape}" + ) + sycl_dev = exec_q.sycl_device + o1_dtype = _get_dtype(o1, sycl_dev) + o2_dtype = _get_dtype(o2, sycl_dev) + if not all(_validate_dtype(o) for o in (o1_dtype, o2_dtype)): + raise ValueError("Operands have unsupported data types") + + o1_dtype, o2_dtype = self.weak_type_resolver_( + o1_dtype, o2_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + o1_dtype, + o2_dtype, + self.result_type_resolver_fn_, + sycl_dev, + acceptance_fn=self.acceptance_fn_, + ) + + if res_dt is None: + raise ValueError( + f"function '{self.name_}' does not support input types " + f"({o1_dtype}, {o2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + _manager = SequentialOrderManager[exec_q] + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if isinstance(o1, dpt.usm_ndarray): + if ti._array_overlap(o1, out) and buf1_dt is None: + if not ti._same_logical_tensors(o1, out): + out = dpt.empty_like(out) + elif self.binary_inplace_fn_ is not None: + # if there is a dedicated in-place kernel + # it can be called here, otherwise continues + if isinstance(o2, dpt.usm_ndarray): + src2 = o2 + if ( + ti._array_overlap(o2, out) + and not ti._same_logical_tensors(o2, out) + and buf2_dt is None + ): + buf2_dt = o2_dtype + else: + src2 = dpt.asarray( + o2, dtype=o2_dtype, sycl_queue=exec_q + ) + if buf2_dt is None: + if src2.shape != res_shape: + src2 = dpt.broadcast_to(src2, res_shape) + dep_evs = _manager.submitted_events + ht_, comp_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=src2, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_, comp_ev) + else: + buf2 = dpt.empty_like(src2, dtype=buf2_dt) + dep_evs = _manager.submitted_events + ( + ht_copy_ev, + copy_ev, + ) = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, + dst=buf2, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, bf_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=buf2, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_, bf_ev) + + return out + + if isinstance(o2, dpt.usm_ndarray): + if ( + ti._array_overlap(o2, out) + and not ti._same_logical_tensors(o2, out) + and buf2_dt is None + ): + # should not reach if out is reallocated + # after being checked against o1 + out = dpt.empty_like(out) + + if isinstance(o1, dpt.usm_ndarray): + src1 = o1 + else: + src1 = dpt.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q) + if isinstance(o2, dpt.usm_ndarray): + src2 = o2 + else: + src2 = dpt.asarray(o2, 
dtype=o2_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + src1, + src2, + ) + ) + else "C" + ) + + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + src1, src2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if src1.shape != res_shape: + src1 = dpt.broadcast_to(src1, res_shape) + if src2.shape != res_shape: + src2 = dpt.broadcast_to(src2, res_shape) + deps_ev = _manager.submitted_events + ht_binary_ev, binary_ev = self.binary_fn_( + src1=src1, + src2=src2, + dst=out, + sycl_queue=exec_q, + depends=deps_ev, + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(src2, buf2_dt) + else: + buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + src1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if src1.shape != res_shape: + src1 = dpt.broadcast_to(src1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = self.binary_fn_( + src1=src1, + src2=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(src1, buf1_dt) + else: + buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, src2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + buf1 = dpt.broadcast_to(buf1, res_shape) + if src2.shape != res_shape: + src2 = dpt.broadcast_to(src2, res_shape) + ht_binary_ev, binary_ev = self.binary_fn_( + src1=buf1, + src2=src2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + 
_manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + if order == "K": + if src1.flags.c_contiguous and src2.flags.c_contiguous: + order = "C" + elif src1.flags.f_contiguous and src2.flags.f_contiguous: + order = "F" + if order == "K": + buf1 = _empty_like_orderK(src1, buf1_dt) + else: + buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + dep_evs = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if order == "K": + buf2 = _empty_like_orderK(src2, buf2_dt) + else: + buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + buf1 = dpt.broadcast_to(buf1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, bf_ev = self.binary_fn_( + src1=buf1, + src2=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, bf_ev) + return out + + def _inplace_op(self, o1, o2): + if self.binary_inplace_fn_ is None: + raise ValueError( + "binary function does not have a dedicated in-place " + "implementation" + ) + if not isinstance(o1, dpt.usm_ndarray): + raise TypeError( + "Expected first argument to be " + f"dpnp.tensor.usm_ndarray, got {type(o1)}" + ) + if not o1.flags.writable: + raise ValueError("provided left-hand side array is read-only") + q1, o1_usm_type = o1.sycl_queue, o1.usm_type + q2, o2_usm_type = _get_queue_usm_type(o2) + if q2 is None: + exec_q = q1 + res_usm_type = o1_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + o1_usm_type, + o2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + o1_shape = o1.shape + o2_shape = _get_shape(o2) + if not isinstance(o2_shape, (tuple, list)): + raise TypeError( + "Shape of second argument can not be inferred. " + "Expected list or tuple." + ) + try: + res_shape = _broadcast_shape_impl( + [ + o1_shape, + o2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{o1_shape} and {o2_shape}" + ) + + if res_shape != o1_shape: + raise ValueError( + "The shape of the non-broadcastable left-hand " + f"side {o1_shape} is inconsistent with the " + f"broadcast shape {res_shape}." 
+ ) + + sycl_dev = exec_q.sycl_device + o1_dtype = o1.dtype + o2_dtype = _get_dtype(o2, sycl_dev) + if not _validate_dtype(o2_dtype): + raise ValueError("Operand has an unsupported data type") + + o1_dtype, o2_dtype = self.weak_type_resolver_( + o1_dtype, o2_dtype, sycl_dev + ) + + buf_dt, res_dt = _find_buf_dtype_in_place_op( + o1_dtype, + o2_dtype, + self.result_type_resolver_fn_, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{self.name_}' does not support input types " + f"({o1_dtype}, {o2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule " + "''same_kind''." + ) + + if res_dt != o1_dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " f"got {o1_dtype}" + ) + + _manager = SequentialOrderManager[exec_q] + if isinstance(o2, dpt.usm_ndarray): + src2 = o2 + if ( + ti._array_overlap(o2, o1) + and not ti._same_logical_tensors(o2, o1) + and buf_dt is None + ): + buf_dt = o2_dtype + else: + src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q) + if buf_dt is None: + if src2.shape != res_shape: + src2 = dpt.broadcast_to(src2, res_shape) + dep_evs = _manager.submitted_events + ht_, comp_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=src2, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_, comp_ev) + else: + buf = dpt.empty_like(src2, dtype=buf_dt) + dep_evs = _manager.submitted_events + ( + ht_copy_ev, + copy_ev, + ) = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, + dst=buf, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + + buf = dpt.broadcast_to(buf, res_shape) + ht_, bf_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=buf, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_, bf_ev) + + return o1 diff --git a/dpnp/tensor/_elementwise_funcs.py b/dpnp/tensor/_elementwise_funcs.py new file mode 100644 index 000000000000..4040f33bf38e --- /dev/null +++ b/dpnp/tensor/_elementwise_funcs.py @@ -0,0 +1,2276 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpnp.tensor._tensor_elementwise_impl as ti + +from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc +from ._type_utils import ( + _acceptance_fn_divide, + _acceptance_fn_negative, + _acceptance_fn_reciprocal, + _acceptance_fn_round, + _acceptance_fn_subtract, + _resolve_weak_types_all_py_ints, +) + +# U01: ==== ABS (x) +_abs_docstring_ = r""" +abs(x, /, \*, out=None, order='K') + +Calculates the absolute value for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, + if parameter `out` is ``None``. + Default: `"K"`. + +Returns: + usm_ndarray: + An array containing the element-wise absolute values. + For complex input, the absolute value is its magnitude. + If `x` has a real-valued data type, the returned array has the + same data type as `x`. If `x` has a complex floating-point data type, + the returned array has a real-valued floating-point data type whose + precision matches the precision of `x`. +""" + +abs = UnaryElementwiseFunc("abs", ti._abs_result_type, ti._abs, _abs_docstring_) +del _abs_docstring_ + +# U02: ==== ACOS (x) +_acos_docstring = r""" +acos(x, /, \*, out=None, order='K') + +Computes inverse cosine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse cosine, in radians + and in the closed interval :math:`[0, \pi]`. The data type of the + returned array is determined by the Type Promotion Rules. +""" + +acos = UnaryElementwiseFunc( + "acos", ti._acos_result_type, ti._acos, _acos_docstring +) +del _acos_docstring + +# U03: ===== ACOSH (x) +_acosh_docstring = r""" +acosh(x, /, \*, out=None, order='K') + +Computes inverse hyperbolic cosine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse hyperbolic cosine, in + radians and in the half-closed interval :math:`[0, \infty)`. The data + type of the returned array is determined by the Type Promotion Rules. 
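A quick illustration of the `abs` semantics documented above (a sketch; assumes the `dpnp.tensor` namespace exposes `asarray` as used elsewhere in this patch):

```python
import dpnp.tensor as dpt

x = dpt.asarray([3 + 4j], dtype="c8")   # complex64 input
y = dpt.abs(x)                          # magnitude: sqrt(3**2 + 4**2)
# y is real-valued float32 with y[0] == 5.0, matching the precision of x
```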
+""" + +acosh = UnaryElementwiseFunc( + "acosh", ti._acosh_result_type, ti._acosh, _acosh_docstring +) +del _acosh_docstring + +# B01: ===== ADD (x1, x2) + +_add_docstring_ = r""" +add(x1, x2, /, \*, out=None, order='K') + +Calculates the sum for each element `x1_i` of the input array `x1` with +the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise sums. The data type of the + returned array is determined by the Type Promotion Rules. +""" +add = BinaryElementwiseFunc( + "add", + ti._add_result_type, + ti._add, + _add_docstring_, + binary_inplace_fn=ti._add_inplace, +) +del _add_docstring_ + +# U04: ===== ASIN (x) +_asin_docstring = r""" +asin(x, /, \*, out=None, order='K') + +Computes inverse sine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse sine, in radians + and in the closed interval :math:`[-\pi/2, \pi/2]`. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +asin = UnaryElementwiseFunc( + "asin", ti._asin_result_type, ti._asin, _asin_docstring +) +del _asin_docstring + +# U05: ===== ASINH (x) +_asinh_docstring = r""" +asinh(x, /, \*, out=None, order='K') + +Computes inverse hyperbolic sine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse hyperbolic sine, in + radians. The data type of the returned array is determined by + the Type Promotion Rules. +""" + +asinh = UnaryElementwiseFunc( + "asinh", ti._asinh_result_type, ti._asinh, _asinh_docstring +) +del _asinh_docstring + +# U06: ===== ATAN (x) +_atan_docstring = r""" +atan(x, /, \*, out=None, order='K') + +Computes inverse tangent for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse tangent, in radians + and in the closed interval :math:`[-\pi/2, \pi/2]`. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" + +atan = UnaryElementwiseFunc( + "atan", ti._atan_result_type, ti._atan, _atan_docstring +) +del _atan_docstring + +# B02: ===== ATAN2 (x1, x2) +_atan2_docstring_ = r""" +atan2(x1, x2, /, \*, out=None, order='K') + +Calculates the inverse tangent of the quotient `x1_i/x2_i` for each element +`x1_i` of the input array `x1` with the respective element `x2_i` of the +input array `x2`. Each element-wise result is expressed in radians. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued floating-point + data type. + x2 (usm_ndarray): + Second input array, also expected to have a real-valued + floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the inverse tangent of the quotient `x1`/`x2`. + The returned array must have a real-valued floating-point data type + determined by Type Promotion Rules. +""" + +atan2 = BinaryElementwiseFunc( + "atan2", ti._atan2_result_type, ti._atan2, _atan2_docstring_ +) +del _atan2_docstring_ + +# U07: ===== ATANH (x) +_atanh_docstring = r""" +atanh(x, /, \*, out=None, order='K') + +Computes hyperbolic inverse tangent for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hyperbolic inverse tangent, in + radians. The data type of the returned array is determined by + the Type Promotion Rules. +""" + +atanh = UnaryElementwiseFunc( + "atanh", ti._atanh_result_type, ti._atanh, _atanh_docstring +) +del _atanh_docstring + +# B03: ===== BITWISE_AND (x1, x2) +_bitwise_and_docstring_ = r""" +bitwise_and(x1, x2, /, \*, out=None, order='K') + +Computes the bitwise AND of the underlying binary representation of each +element `x1_i` of the input array `x1` with the respective element `x2_i` +of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have integer or boolean data type. + x2 (usm_ndarray): + Second input array, also expected to have integer or boolean data + type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_and = BinaryElementwiseFunc( + "bitwise_and", + ti._bitwise_and_result_type, + ti._bitwise_and, + _bitwise_and_docstring_, + binary_inplace_fn=ti._bitwise_and_inplace, +) +del _bitwise_and_docstring_ + +# B04: ===== BITWISE_LEFT_SHIFT (x1, x2) +_bitwise_left_shift_docstring_ = r""" +bitwise_left_shift(x1, x2, /, \*, out=None, order='K') + +Shifts the bits of each element `x1_i` of the input array x1 to the left by +appending `x2_i` (i.e., the respective element in the input array `x2`) zeros to +the right of `x1_i`. 
+ +Args: + x1 (usm_ndarray): + First input array, expected to have integer data type. + x2 (usm_ndarray): + Second input array, also expected to have integer data type. + Each element must be greater than or equal to 0. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_left_shift = BinaryElementwiseFunc( + "bitwise_left_shift", + ti._bitwise_left_shift_result_type, + ti._bitwise_left_shift, + _bitwise_left_shift_docstring_, + binary_inplace_fn=ti._bitwise_left_shift_inplace, +) +del _bitwise_left_shift_docstring_ + +# U08: ===== BITWISE_INVERT (x) +_bitwise_invert_docstring = r""" +bitwise_invert(x, /, \*, out=None, order='K') + +Inverts (flips) each bit for each element `x_i` of the input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have integer or boolean data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. + The data type of the returned array is same as the data type of the + input array. +""" + +bitwise_invert = UnaryElementwiseFunc( + "bitwise_invert", + ti._bitwise_invert_result_type, + ti._bitwise_invert, + _bitwise_invert_docstring, +) +del _bitwise_invert_docstring + +# B05: ===== BITWISE_OR (x1, x2) +_bitwise_or_docstring_ = r""" +bitwise_or(x1, x2, /, \*, out=None, order='K') + +Computes the bitwise OR of the underlying binary representation of each +element `x1_i` of the input array `x1` with the respective element `x2_i` +of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have integer or boolean data type. + x2 (usm_ndarray): + Second input array, also expected to have integer or boolean data + type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_or = BinaryElementwiseFunc( + "bitwise_or", + ti._bitwise_or_result_type, + ti._bitwise_or, + _bitwise_or_docstring_, + binary_inplace_fn=ti._bitwise_or_inplace, +) +del _bitwise_or_docstring_ + +# B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) +_bitwise_right_shift_docstring_ = r""" +bitwise_right_shift(x1, x2, /, \*, out=None, order='K') + +Shifts the bits of each element `x1_i` of the input array `x1` to the right +according to the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have integer data type. + x2 (usm_ndarray): + Second input array, also expected to have integer data type. + Each element must be greater than or equal to 0. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. 
+ order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_right_shift = BinaryElementwiseFunc( + "bitwise_right_shift", + ti._bitwise_right_shift_result_type, + ti._bitwise_right_shift, + _bitwise_right_shift_docstring_, + binary_inplace_fn=ti._bitwise_right_shift_inplace, +) +del _bitwise_right_shift_docstring_ + + +# B07: ===== BITWISE_XOR (x1, x2) +_bitwise_xor_docstring_ = r""" +bitwise_xor(x1, x2, /, \*, out=None, order='K') + +Computes the bitwise XOR of the underlying binary representation of each +element `x1_i` of the input array `x1` with the respective element `x2_i` +of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have integer or boolean data type. + x2 (usm_ndarray): + Second input array, also expected to have integer or boolean data + type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_xor = BinaryElementwiseFunc( + "bitwise_xor", + ti._bitwise_xor_result_type, + ti._bitwise_xor, + _bitwise_xor_docstring_, + binary_inplace_fn=ti._bitwise_xor_inplace, +) +del _bitwise_xor_docstring_ + +# U09: ==== CEIL (x) +_ceil_docstring = r""" +ceil(x, /, \*, out=None, order='K') + +Returns the ceiling for each element `x_i` for input array `x`. + +The ceil of `x_i` is the smallest integer `n`, such that `n >= x_i`. + +Args: + x (usm_ndarray): + Input array, expected to have a boolean or real-valued data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise ceiling. +""" + +ceil = UnaryElementwiseFunc( + "ceil", ti._ceil_result_type, ti._ceil, _ceil_docstring +) +del _ceil_docstring + +# U10: ==== CONJ (x) +_conj_docstring = r""" +conj(x, /, \*, out=None, order='K') + +Computes conjugate of each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise conjugate values. +""" + +conj = UnaryElementwiseFunc( + "conj", ti._conj_result_type, ti._conj, _conj_docstring +) +del _conj_docstring + +# U11: ==== COS (x) +_cos_docstring = r""" +cos(x, /, \*, out=None, order='K') + +Computes cosine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. 
+ order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise cosine. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +cos = UnaryElementwiseFunc("cos", ti._cos_result_type, ti._cos, _cos_docstring) +del _cos_docstring + +# U12: ==== COSH (x) +_cosh_docstring = r""" +cosh(x, /, \*, out=None, order='K') + +Computes hyperbolic cosine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hyperbolic cosine. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +cosh = UnaryElementwiseFunc( + "cosh", ti._cosh_result_type, ti._cosh, _cosh_docstring +) +del _cosh_docstring + +# B08: ==== DIVIDE (x1, x2) +_divide_docstring_ = r""" +divide(x1, x2, /, \*, out=None, order='K') + +Calculates the ratio for each element `x1_i` of the input array `x1` with +the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a floating-point data type. + x2 (usm_ndarray): + Second input array, also expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise division. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +divide = BinaryElementwiseFunc( + "divide", + ti._divide_result_type, + ti._divide, + _divide_docstring_, + binary_inplace_fn=ti._divide_inplace, + acceptance_fn=_acceptance_fn_divide, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _divide_docstring_ + +# B09: ==== EQUAL (x1, x2) +_equal_docstring_ = r""" +equal(x1, x2, /, \*, out=None, order='K') + +Calculates equality test results for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise equality comparison. + The returned array has a data type of `bool`. +""" + +equal = BinaryElementwiseFunc( + "equal", + ti._equal_result_type, + ti._equal, + _equal_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _equal_docstring_ + +# U13: ==== EXP (x) +_exp_docstring = r""" +exp(x, /, \*, out=None, order='K') + +Computes the exponential for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. 
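To illustrate the weak-type handling that `_resolve_weak_types_all_py_ints` provides for `divide` and `equal`, a small hedged sketch via the same `dpctl.tensor` stand-in; the exact floating dtype depends on the device's fp64 support.

```python
import dpctl.tensor as dpt

x = dpt.asarray([1, 2, 3], dtype="int32")

# divide always yields a floating-point result under the promotion rules
r = dpt.divide(x, 2)   # the Python int participates as a weak type
print(r.dtype)         # float32 or float64, device-dependent
print(dpt.asnumpy(dpt.equal(x, 2)))  # [False  True False], dtype bool
```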
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise exponential of `x`.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+exp = UnaryElementwiseFunc("exp", ti._exp_result_type, ti._exp, _exp_docstring)
+del _exp_docstring
+
+# B10: ==== FLOOR_DIVIDE (x1, x2)
+_floor_divide_docstring_ = r"""
+floor_divide(x1, x2, /, \*, out=None, order='K')
+
+Calculates the ratio for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`, rounded down to the
+greatest integer-valued number that is not greater than the division result.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise floor of division.
+        The data type of the returned array is determined by the Type
+        Promotion Rules.
+"""
+
+floor_divide = BinaryElementwiseFunc(
+    "floor_divide",
+    ti._floor_divide_result_type,
+    ti._floor_divide,
+    _floor_divide_docstring_,
+    binary_inplace_fn=ti._floor_divide_inplace,
+)
+del _floor_divide_docstring_
+
+# B11: ==== GREATER (x1, x2)
+_greater_docstring_ = r"""
+greater(x1, x2, /, \*, out=None, order='K')
+
+Computes the greater-than test results for each element `x1_i` of
+the input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise greater-than comparison.
+        The returned array has a data type of `bool`.
+"""
+
+greater = BinaryElementwiseFunc(
+    "greater",
+    ti._greater_result_type,
+    ti._greater,
+    _greater_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _greater_docstring_
+
+# B12: ==== GREATER_EQUAL (x1, x2)
+_greater_equal_docstring_ = r"""
+greater_equal(x1, x2, /, \*, out=None, order='K')
+
+Computes the greater-than or equal-to test results for each element `x1_i` of
+the input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise greater-than or equal-to
+        comparison.
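The floor semantics documented for `floor_divide` differ from C-style truncation for negative operands; a brief illustration, assuming the same stand-in namespace as above.

```python
import dpctl.tensor as dpt

a = dpt.asarray([7, -7], dtype="int64")
b = dpt.asarray([2, 2], dtype="int64")

# floor_divide rounds toward negative infinity, unlike C-style truncation
print(dpt.asnumpy(dpt.floor_divide(a, b)))  # [ 3 -4]
print(dpt.asnumpy(dpt.greater(a, b)))       # [ True False]
```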
+ The returned array has a data type of `bool`. +""" + +greater_equal = BinaryElementwiseFunc( + "greater_equal", + ti._greater_equal_result_type, + ti._greater_equal, + _greater_equal_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _greater_equal_docstring_ + +# U14: ==== EXPM1 (x) +_expm1_docstring = r""" +expm1(x, /, \*, out=None, order='K') + +Computes the exponential minus 1 for each element `x_i` of input array `x`. + +This function calculates `exp(x) - 1.0` more accurately for small values of `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise `exp(x) - 1` results. + The data type of the returned array is determined by the Type + Promotion Rules. +""" + +expm1 = UnaryElementwiseFunc( + "expm1", ti._expm1_result_type, ti._expm1, _expm1_docstring +) +del _expm1_docstring + +# U15: ==== FLOOR (x) +_floor_docstring = r""" +floor(x, /, \*, out=None, order='K') + +Returns the floor for each element `x_i` for input array `x`. + +The floor of `x_i` is the largest integer `n`, such that `n <= x_i`. + +Args: + x (usm_ndarray): + Input array, expected to have a boolean or real-valued data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise floor. +""" + +floor = UnaryElementwiseFunc( + "floor", ti._floor_result_type, ti._floor, _floor_docstring +) +del _floor_docstring + +# U16: ==== IMAG (x) +_imag_docstring = r""" +imag(x, /, \*, out=None, order='K') + +Computes imaginary part of each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise imaginary component of input. + If the input is a real-valued data type, the returned array has + the same data type. If the input is a complex floating-point + data type, the returned array has a floating-point data type + with the same floating-point precision as complex input. +""" + +imag = UnaryElementwiseFunc( + "imag", ti._imag_result_type, ti._imag, _imag_docstring +) +del _imag_docstring + +# U17: ==== ISFINITE (x) +_isfinite_docstring_ = r""" +isfinite(x, /, \*, out=None, order='K') + +Test if each element of input array is a finite number. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array which is True where `x` is not positive infinity, + negative infinity, or NaN, False otherwise. 
+ The data type of the returned array is `bool`. +""" + +isfinite = UnaryElementwiseFunc( + "isfinite", ti._isfinite_result_type, ti._isfinite, _isfinite_docstring_ +) +del _isfinite_docstring_ + +# U18: ==== ISINF (x) +_isinf_docstring_ = r""" +isinf(x, /, \*, out=None, order='K') + +Test if each element of input array is an infinity. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array which is True where `x` is positive or negative infinity, + False otherwise. The data type of the returned array is `bool`. +""" + +isinf = UnaryElementwiseFunc( + "isinf", ti._isinf_result_type, ti._isinf, _isinf_docstring_ +) +del _isinf_docstring_ + +# U19: ==== ISNAN (x) +_isnan_docstring_ = r""" +isnan(x, /, \*, out=None, order='K') + +Test if each element of an input array is a NaN. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array which is True where x is NaN, False otherwise. + The data type of the returned array is `bool`. +""" + +isnan = UnaryElementwiseFunc( + "isnan", ti._isnan_result_type, ti._isnan, _isnan_docstring_ +) +del _isnan_docstring_ + +# B13: ==== LESS (x1, x2) +_less_docstring_ = r""" +less(x1, x2, /, \*, out=None, order='K') + +Computes the less-than test results for each element `x1_i` of +the input array `x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise less-than comparison. + The returned array has a data type of `bool`. +""" + +less = BinaryElementwiseFunc( + "less", + ti._less_result_type, + ti._less, + _less_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _less_docstring_ + + +# B14: ==== LESS_EQUAL (x1, x2) +_less_equal_docstring_ = r""" +less_equal(x1, x2, /, \*, out=None, order='K') + +Computes the less-than or equal-to test results for each element `x1_i` of +the input array `x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise less-than or equal-to + comparison. The returned array has a data type of `bool`. 
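The three classification functions above compose naturally; a short sketch of their expected outputs on a default-device array.

```python
import dpctl.tensor as dpt

x = dpt.asarray([1.0, float("inf"), float("-inf"), float("nan")])

print(dpt.asnumpy(dpt.isfinite(x)))  # [ True False False False]
print(dpt.asnumpy(dpt.isinf(x)))     # [False  True  True False]
print(dpt.asnumpy(dpt.isnan(x)))     # [False False False  True]
```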
+""" + +less_equal = BinaryElementwiseFunc( + "less_equal", + ti._less_equal_result_type, + ti._less_equal, + _less_equal_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _less_equal_docstring_ + +# U20: ==== LOG (x) +_log_docstring = r""" +log(x, /, \*, out=None, order='K') + +Computes the natural logarithm for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise natural logarithm values. + The data type of the returned array is determined by the Type + Promotion Rules. +""" + +log = UnaryElementwiseFunc("log", ti._log_result_type, ti._log, _log_docstring) +del _log_docstring + +# U21: ==== LOG1P (x) +_log1p_docstring = r""" +log1p(x, /, \*, out=None, order='K') + +Computes the natural logarithm of (1 + `x`) for each element `x_i` of input +array `x`. + +This function calculates `log(1 + x)` more accurately for small values of `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise `log(1 + x)` results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +log1p = UnaryElementwiseFunc( + "log1p", ti._log1p_result_type, ti._log1p, _log1p_docstring +) +del _log1p_docstring + +# U22: ==== LOG2 (x) +_log2_docstring_ = r""" +log2(x, /, \*, out=None, order='K') + +Computes the base-2 logarithm for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise base-2 logarithm of `x`. + The data type of the returned array is determined by the + Type Promotion Rules. +""" + +log2 = UnaryElementwiseFunc( + "log2", ti._log2_result_type, ti._log2, _log2_docstring_ +) +del _log2_docstring_ + +# U23: ==== LOG10 (x) +_log10_docstring_ = r""" +log10(x, /, \*, out=None, order='K') + +Computes the base-10 logarithm for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: `"K"`. + +Returns: + usm_ndarray: + An array containing the element-wise base-10 logarithm of `x`. + The data type of the returned array is determined by the + Type Promotion Rules. 
+""" + +log10 = UnaryElementwiseFunc( + "log10", ti._log10_result_type, ti._log10, _log10_docstring_ +) +del _log10_docstring_ + +# B15: ==== LOGADDEXP (x1, x2) +_logaddexp_docstring_ = r""" +logaddexp(x1, x2, /, \*, out=None, order='K') + +Calculates the natural logarithm of the sum of exponentials for each element +`x1_i` of the input array `x1` with the respective element `x2_i` of the input +array `x2`. + +This function calculates `log(exp(x1) + exp(x2))` more accurately for small +values of `x`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued floating-point data + type. + x2 (usm_ndarray): + Second input array, also expected to have a real-valued floating-point + data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +logaddexp = BinaryElementwiseFunc( + "logaddexp", ti._logaddexp_result_type, ti._logaddexp, _logaddexp_docstring_ +) +del _logaddexp_docstring_ + +# B16: ==== LOGICAL_AND (x1, x2) +_logical_and_docstring_ = r""" +logical_and(x1, x2, /, \*, out=None, order='K') + +Computes the logical AND for each element `x1_i` of the input array `x1` with +the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise logical AND results. +""" +logical_and = BinaryElementwiseFunc( + "logical_and", + ti._logical_and_result_type, + ti._logical_and, + _logical_and_docstring_, +) +del _logical_and_docstring_ + +# B17: ==== LOGICAL_OR (x1, x2) +_logical_or_docstring_ = r""" +logical_or(x1, x2, /, \*, out=None, order='K') + +Computes the logical OR for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise logical OR results. +""" +logical_or = BinaryElementwiseFunc( + "logical_or", + ti._logical_or_result_type, + ti._logical_or, + _logical_or_docstring_, +) +del _logical_or_docstring_ + +# B18: ==== LOGICAL_XOR (x1, x2) +_logical_xor_docstring_ = r""" +logical_xor(x1, x2, /, \*, out=None, order='K') + +Computes the logical XOR for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. 
+ out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise logical XOR results. +""" +logical_xor = BinaryElementwiseFunc( + "logical_xor", + ti._logical_xor_result_type, + ti._logical_xor, + _logical_xor_docstring_, +) +del _logical_xor_docstring_ + +# U24: ==== LOGICAL_NOT (x) +_logical_not_docstring = r""" +logical_not(x, /, \*, out=None, order='K') + +Computes the logical NOT for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise logical NOT results. +""" + +logical_not = UnaryElementwiseFunc( + "logical_not", + ti._logical_not_result_type, + ti._logical_not, + _logical_not_docstring, +) +del _logical_not_docstring + +# B26: ==== MAXIMUM (x1, x2) +_maximum_docstring_ = r""" +maximum(x1, x2, /, \*, out=None, order='K') + +Compares two input arrays `x1` and `x2` and returns a new array containing the +element-wise maxima. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise maxima. The data type of + the returned array is determined by the Type Promotion Rules. +""" +maximum = BinaryElementwiseFunc( + "maximum", + ti._maximum_result_type, + ti._maximum, + _maximum_docstring_, +) +del _maximum_docstring_ + +# B27: ==== MINIMUM (x1, x2) +_minimum_docstring_ = r""" +minimum(x1, x2, /, \*, out=None, order='K') + +Compares two input arrays `x1` and `x2` and returns a new array containing the +element-wise minima. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise minima. The data type of + the returned array is determined by the Type Promotion Rules. +""" +minimum = BinaryElementwiseFunc( + "minimum", + ti._minimum_result_type, + ti._minimum, + _minimum_docstring_, +) +del _minimum_docstring_ + +# B19: ==== MULTIPLY (x1, x2) +_multiply_docstring_ = r""" +multiply(x1, x2, /, \*, out=None, order='K') + +Calculates the product for each element `x1_i` of the input array `x1` with the +respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. 
+ Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise products. The data type of + the returned array is determined by the Type Promotion Rules. +""" +multiply = BinaryElementwiseFunc( + "multiply", + ti._multiply_result_type, + ti._multiply, + _multiply_docstring_, + binary_inplace_fn=ti._multiply_inplace, +) +del _multiply_docstring_ + +# U25: ==== NEGATIVE (x) +_negative_docstring_ = r""" +negative(x, /, \*, out=None, order='K') + +Computes the numerical negative for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a numeric data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the negative of `x`. +""" + +negative = UnaryElementwiseFunc( + "negative", + ti._negative_result_type, + ti._negative, + _negative_docstring_, + acceptance_fn=_acceptance_fn_negative, +) +del _negative_docstring_ + +# B28: ==== NEXTAFTER (x1, x2) +_nextafter_docstring_ = r""" +nextafter(x1, x2, /, \*, out=None, order='K') + +Calculates the next floating-point value after element `x1_i` of the input +array `x1` toward the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued floating-point data + type. + x2 (usm_ndarray): + Second input array, expected to have a real-valued floating-point data + type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise next representable values of `x1` + in the direction of `x2`. The data type of the returned array is + determined by the Type Promotion Rules. +""" +nextafter = BinaryElementwiseFunc( + "nextafter", + ti._nextafter_result_type, + ti._nextafter, + _nextafter_docstring_, +) +del _nextafter_docstring_ + +# B20: ==== NOT_EQUAL (x1, x2) +_not_equal_docstring_ = r""" +not_equal(x1, x2, /, \*, out=None, order='K') + +Calculates inequality test results for each element `x1_i` of the +input array `x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. + x2 (usm_ndarray): + Second input array. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise inequality comparison. + The returned array has a data type of `bool`. +""" + +not_equal = BinaryElementwiseFunc( + "not_equal", + ti._not_equal_result_type, + ti._not_equal, + _not_equal_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _not_equal_docstring_ + +# U26: ==== POSITIVE (x) +_positive_docstring_ = r""" +positive(x, /, \*, out=None, order='K') + +Computes the numerical positive for each element `x_i` of input array `x`. 
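A small illustration of `nextafter`, whose step size reveals the dtype's spacing near 1.0; the values in the comments are the usual IEEE spacings, not guaranteed output.

```python
import dpctl.tensor as dpt

one = dpt.asarray([1.0])
two = dpt.asarray([2.0])

# smallest representable step above 1.0 for the array's dtype:
# ~2.22e-16 for float64, ~1.19e-07 for float32
nxt = dpt.nextafter(one, two)
print(dpt.asnumpy(nxt - one))
```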
+ +Args: + x (usm_ndarray): + Input array, expected to have a numeric data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the positive of `x`. +""" + +positive = UnaryElementwiseFunc( + "positive", ti._positive_result_type, ti._positive, _positive_docstring_ +) +del _positive_docstring_ + +# B21: ==== POW (x1, x2) +_pow_docstring_ = r""" +pow(x1, x2, /, \*, out=None, order='K') + +Calculates `x1_i` raised to `x2_i` for each element `x1_i` of the input array +`x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have a numeric data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the bases in `x1` raised to the exponents in `x2` + element-wise. The data type of the returned array is determined by the + Type Promotion Rules. +""" +pow = BinaryElementwiseFunc( + "pow", + ti._pow_result_type, + ti._pow, + _pow_docstring_, + binary_inplace_fn=ti._pow_inplace, +) +del _pow_docstring_ + +# U27: ==== REAL (x) +_real_docstring = r""" +real(x, /, \*, out=None, order='K') + +Computes real part of each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise real component of input. + If the input is a real-valued data type, the returned array has + the same data type. If the input is a complex floating-point + data type, the returned array has a floating-point data type + with the same floating-point precision as complex input. +""" + +real = UnaryElementwiseFunc( + "real", ti._real_result_type, ti._real, _real_docstring +) +del _real_docstring + +# B22: ==== REMAINDER (x1, x2) +_remainder_docstring_ = r""" +remainder(x1, x2, /, \*, out=None, order='K') + +Calculates the remainder of division for each element `x1_i` of the input array +`x1` with the respective element `x2_i` of the input array `x2`. + +This function is equivalent to the Python modulus operator. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued data type. + x2 (usm_ndarray): + Second input array, also expected to have a real-valued data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise remainders. Each remainder has the + same sign as respective element `x2_i`. The data type of the returned + array is determined by the Type Promotion Rules. 
+""" +remainder = BinaryElementwiseFunc( + "remainder", + ti._remainder_result_type, + ti._remainder, + _remainder_docstring_, + binary_inplace_fn=ti._remainder_inplace, +) +del _remainder_docstring_ + +# U28: ==== ROUND (x) +_round_docstring = r""" +round(x, /, \*, out=None, order='K') + +Rounds each element `x_i` of the input array `x` to +the nearest integer-valued number. + +When two integers are equally close to `x_i`, the result is the nearest even +integer to `x_i`. + +Args: + x (usm_ndarray): + Input array, expected to have a numeric data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise rounded values. +""" + +round = UnaryElementwiseFunc( + "round", + ti._round_result_type, + ti._round, + _round_docstring, + acceptance_fn=_acceptance_fn_round, +) +del _round_docstring + +# U29: ==== SIGN (x) +_sign_docstring = r""" +sign(x, /, \*, out=None, order='K') + +Computes an indication of the sign of each element `x_i` of input array `x` +using the signum function. + +The signum function returns `-1` if `x_i` is less than `0`, +`0` if `x_i` is equal to `0`, and `1` if `x_i` is greater than `0`. + +Args: + x (usm_ndarray): + Input array, expected to have a numeric data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise result of the signum function. The + data type of the returned array is determined by the Type Promotion + Rules. +""" + +sign = UnaryElementwiseFunc( + "sign", ti._sign_result_type, ti._sign, _sign_docstring +) +del _sign_docstring + +# U30: ==== SIN (x) +_sin_docstring = r""" +sin(x, /, \*, out=None, order='K') + +Computes sine for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real-valued floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise sine. The data type of the + returned array is determined by the Type Promotion Rules. +""" + +sin = UnaryElementwiseFunc("sin", ti._sin_result_type, ti._sin, _sin_docstring) +del _sin_docstring + +# U31: ==== SINH (x) +_sinh_docstring = r""" +sinh(x, /, \*, out=None, order='K') + +Computes hyperbolic sine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hyperbolic sine. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" + +sinh = UnaryElementwiseFunc( + "sinh", ti._sinh_result_type, ti._sinh, _sinh_docstring +) +del _sinh_docstring + +# U32: ==== SQUARE (x) +_square_docstring_ = r""" +square(x, /, \*, out=None, order='K') + +Squares each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise squares of `x`. The data type of + the returned array is determined by the Type Promotion Rules. +""" + +square = UnaryElementwiseFunc( + "square", ti._square_result_type, ti._square, _square_docstring_ +) +del _square_docstring_ + +# U33: ==== SQRT (x) +_sqrt_docstring_ = r""" +sqrt(x, /, \*, out=None, order='K') + +Computes the positive square-root for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise positive square-roots of `x`. The + data type of the returned array is determined by the Type Promotion + Rules. +""" + +sqrt = UnaryElementwiseFunc( + "sqrt", ti._sqrt_result_type, ti._sqrt, _sqrt_docstring_ +) +del _sqrt_docstring_ + +# B23: ==== SUBTRACT (x1, x2) +_subtract_docstring_ = r""" +subtract(x1, x2, /, \*, out=None, order='K') + +Calculates the difference between each element `x1_i` of the input +array `x1` and the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have a numeric data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise differences. The data type + of the returned array is determined by the Type Promotion Rules. +""" +subtract = BinaryElementwiseFunc( + "subtract", + ti._subtract_result_type, + ti._subtract, + _subtract_docstring_, + binary_inplace_fn=ti._subtract_inplace, + acceptance_fn=_acceptance_fn_subtract, +) +del _subtract_docstring_ + +# U34: ==== TAN (x) +_tan_docstring = r""" +tan(x, /, \*, out=None, order='K') + +Computes tangent for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise tangent. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" + +tan = UnaryElementwiseFunc("tan", ti._tan_result_type, ti._tan, _tan_docstring) +del _tan_docstring + +# U35: ==== TANH (x) +_tanh_docstring = r""" +tanh(x, /, \*, out=None, order='K') + +Computes hyperbolic tangent for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hyperbolic tangent. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +tanh = UnaryElementwiseFunc( + "tanh", ti._tanh_result_type, ti._tanh, _tanh_docstring +) +del _tanh_docstring + +# U36: ==== TRUNC (x) +_trunc_docstring = r""" +trunc(x, /, \*, out=None, order='K') + +Returns the truncated value for each element `x_i` for input array `x`. + +The truncated value of the scalar `x` is the nearest integer i which is +closer to zero than `x` is. In short, the fractional part of the +signed number `x` is discarded. + +Args: + x (usm_ndarray): + Input array, expected to have a boolean or real-valued data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise division. The data type + of the returned array is determined by the Type Promotion Rules. +""" +trunc = UnaryElementwiseFunc( + "trunc", ti._trunc_result_type, ti._trunc, _trunc_docstring +) +del _trunc_docstring + +# B24: ==== HYPOT (x1, x2) +_hypot_docstring_ = r""" +hypot(x1, x2, /, \*, out=None, order='K') + +Computes the square root of the sum of squares for each element `x1_i` of the +input array `x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued floating-point data + type. + x2 (usm_ndarray): + Second input array, also expected to have a real-valued floating-point + data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hypotenuse. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +hypot = BinaryElementwiseFunc( + "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_ +) +del _hypot_docstring_ + +# U37: ==== CBRT (x) +_cbrt_docstring_ = r""" +cbrt(x, /, \*, out=None, order='K') + +Computes the cube-root for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real-valued floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise cube-root. 
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+cbrt = UnaryElementwiseFunc(
+    "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_
+)
+del _cbrt_docstring_
+
+# U38: ==== EXP2 (x)
+_exp2_docstring_ = r"""
+exp2(x, /, \*, out=None, order='K')
+
+Computes the base-2 exponential for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise base-2 exponentials.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+exp2 = UnaryElementwiseFunc(
+    "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_
+)
+del _exp2_docstring_
+
+# B25: ==== COPYSIGN (x1, x2)
+_copysign_docstring_ = r"""
+copysign(x1, x2, /, \*, out=None, order='K')
+
+Composes a floating-point value with the magnitude of `x1_i` and the sign of
+`x2_i` for each element of input arrays `x1` and `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point data
+        type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued floating-point
+        data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+copysign = BinaryElementwiseFunc(
+    "copysign",
+    ti._copysign_result_type,
+    ti._copysign,
+    _copysign_docstring_,
+)
+del _copysign_docstring_
+
+# U39: ==== RSQRT (x)
+_rsqrt_docstring_ = r"""
+rsqrt(x, /, \*, out=None, order='K')
+
+Computes the reciprocal square-root for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real-valued floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise reciprocal square-root.
+        The returned array has a floating-point data type determined by
+        the Type Promotion Rules.
+"""
+
+rsqrt = UnaryElementwiseFunc(
+    "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_
+)
+del _rsqrt_docstring_
+
+# U40: ==== PROJ (x)
+_proj_docstring = r"""
+proj(x, /, \*, out=None, order='K')
+
+Computes projection of each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a complex data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise projection.
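A short sketch of the sign-composition behavior of `copysign` (including negative zero) alongside `exp2` and `cbrt`, using the same stand-in namespace.

```python
import dpctl.tensor as dpt

m = dpt.asarray([1.5, 2.5, 3.5])
s = dpt.asarray([-1.0, 1.0, -0.0])

# magnitude from the first argument, sign from the second;
# note that the sign of -0.0 counts as negative
print(dpt.asnumpy(dpt.copysign(m, s)))                  # [-1.5  2.5 -3.5]
print(dpt.asnumpy(dpt.exp2(dpt.asarray([0.0, 10.0]))))  # [   1. 1024.]
print(dpt.asnumpy(dpt.cbrt(dpt.asarray([8.0, 27.0]))))  # [2. 3.]
```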
+""" + +proj = UnaryElementwiseFunc( + "proj", ti._proj_result_type, ti._proj, _proj_docstring +) +del _proj_docstring + +# U41: ==== SIGNBIT (x) +_signbit_docstring = r""" +signbit(x, /, \*, out=None, order='K') + +Computes an indication of whether the sign bit of each element `x_i` of +input array `x` is set. + +Args: + x (usm_ndarray): + Input array, expected to have a real-valued floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise signbit results. The returned array + must have a data type of `bool`. +""" + +signbit = UnaryElementwiseFunc( + "signbit", ti._signbit_result_type, ti._signbit, _signbit_docstring +) +del _signbit_docstring + +# U42: ==== RECIPROCAL (x) +_reciprocal_docstring = r""" +reciprocal(x, /, \*, out=None, order='K') + +Computes the reciprocal of each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise reciprocals. + The returned array has a floating-point data type determined + by the Type Promotion Rules. +""" + +reciprocal = UnaryElementwiseFunc( + "reciprocal", + ti._reciprocal_result_type, + ti._reciprocal, + _reciprocal_docstring, + acceptance_fn=_acceptance_fn_reciprocal, +) +del _reciprocal_docstring + +# U43: ==== ANGLE (x) +_angle_docstring = r""" +angle(x, /, \*, out=None, order='K') + +Computes the phase angle (also called the argument) of each element `x_i` for +input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a complex floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise phase angles. + The returned array has a floating-point data type determined + by the Type Promotion Rules. +""" + +angle = UnaryElementwiseFunc( + "angle", + ti._angle_result_type, + ti._angle, + _angle_docstring, +) +del _angle_docstring + +del ti diff --git a/dpnp/tensor/_flags.pyx b/dpnp/tensor/_flags.pyx new file mode 100644 index 000000000000..322d52bd56c7 --- /dev/null +++ b/dpnp/tensor/_flags.pyx @@ -0,0 +1,175 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +from libcpp cimport bool as cpp_bool + +from ._usmarray cimport ( + USM_ARRAY_C_CONTIGUOUS, + USM_ARRAY_F_CONTIGUOUS, + USM_ARRAY_WRITABLE, + usm_ndarray, +) + + +cdef cpp_bool _check_bit(int flag, int mask): + return (flag & mask) == mask + + +cdef class Flags: + """ + Helper class to query the flags of a :class:`dpctl.tensor.usm_ndarray` + instance, which describe how the instance interfaces with its underlying + memory. + """ + cdef int flags_ + cdef usm_ndarray arr_ + + def __cinit__(self, usm_ndarray arr, int flags): + self.arr_ = arr + self.flags_ = flags + + @property + def flags(self): + """ + Integer representation of the memory layout flags of + :class:`dpctl.tensor.usm_ndarray` instance. + """ + return self.flags_ + + @property + def c_contiguous(self): + """ + True if the memory layout of the + :class:`dpctl.tensor.usm_ndarray` instance is C-contiguous. + """ + return _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS) + + @property + def f_contiguous(self): + """ + True if the memory layout of the + :class:`dpctl.tensor.usm_ndarray` instance is F-contiguous. + """ + return _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS) + + @property + def writable(self): + """ + True if :class:`dpctl.tensor.usm_ndarray` instance is writable. + """ + return _check_bit(self.flags_, USM_ARRAY_WRITABLE) + + @writable.setter + def writable(self, new_val): + if not isinstance(new_val, bool): + raise TypeError("Expecting a boolean value") + self.arr_._set_writable_flag(new_val) + + @property + def fc(self): + """ + True if the memory layout of the :class:`dpctl.tensor.usm_ndarray` + instance is C-contiguous and F-contiguous. + """ + return ( + _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS) + and _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS) + ) + + @property + def forc(self): + """ + True if the memory layout of the :class:`dpctl.tensor.usm_ndarray` + instance is C-contiguous or F-contiguous. + """ + return ( + _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS) + or _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS) + ) + + @property + def fnc(self): + """ + True if the memory layout of the :class:`dpctl.tensor.usm_ndarray` + instance is F-contiguous and not C-contiguous. 
+        """
+        return (
+            _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+            and not _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+        )
+
+    @property
+    def contiguous(self):
+        """
+        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+        instance is C-contiguous or F-contiguous.
+        Equivalent to `forc`.
+        """
+        return self.forc
+
+    def __getitem__(self, name):
+        if name in ["C_CONTIGUOUS", "C"]:
+            return self.c_contiguous
+        elif name in ["F_CONTIGUOUS", "F"]:
+            return self.f_contiguous
+        elif name in ["WRITABLE", "W"]:
+            return self.writable
+        elif name == "FC":
+            return self.fc
+        elif name == "FNC":
+            return self.fnc
+        elif name in ["FORC", "CONTIGUOUS"]:
+            return self.forc
+
+    def __setitem__(self, name, val):
+        if name in ["WRITABLE", "W"]:
+            self.writable = val
+        else:
+            raise ValueError(
+                "Only writable ('W' or 'WRITABLE') flag can be set"
+            )
+
+    def __repr__(self):
+        out = []
+        for name in "C_CONTIGUOUS", "F_CONTIGUOUS", "WRITABLE":
+            out.append("  {} : {}".format(name, self[name]))
+        return "\n".join(out)
+
+    def __eq__(self, other):
+        cdef Flags other_
+        if isinstance(other, self.__class__):
+            other_ = other
+            return self.flags_ == other_.flags_
+        elif isinstance(other, int):
+            return self.flags_ == other
+        else:
+            return False
diff --git a/dpnp/tensor/_indexing_functions.py b/dpnp/tensor/_indexing_functions.py
new file mode 100644
index 000000000000..9ea0a16bdd03
--- /dev/null
+++ b/dpnp/tensor/_indexing_functions.py
@@ -0,0 +1,633 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
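For reviewers, a sketch of how the `Flags` helper is meant to be used through an array's `flags` attribute; this assumes the vendored `usm_ndarray` exposes the same interface as `dpctl.tensor`'s, including the writable setter shown above.

```python
import dpctl.tensor as dpt

x = dpt.zeros((2, 3), dtype="float32")
print(x.flags.c_contiguous)     # True for a freshly allocated C-order array
print(x.flags["F_CONTIGUOUS"])  # False for a non-square 2D C-order array

v = x.T                         # transposed view
print(v.flags.f_contiguous)     # True

x.flags["W"] = False            # only the writable flag can be assigned
print(x.flags.writable)         # False
```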
+# *****************************************************************************
+
+import operator
+
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+
+from ._copy_utils import (
+    _extract_impl,
+    _nonzero_impl,
+    _put_multi_index,
+    _take_multi_index,
+)
+from ._numpy_helper import normalize_axis_index
+
+
+def _get_indexing_mode(name):
+    modes = {"wrap": 0, "clip": 1}
+    try:
+        return modes[name]
+    except KeyError:
+        raise ValueError(
+            "`mode` must be `wrap` or `clip`. Got `{}`.".format(name)
+        )
+
+
+def _range(sh_i, i, nd, q, usm_t, dt):
+    ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
+    ind.shape = tuple(sh_i if i == j else 1 for j in range(nd))
+    return ind
+
+
+def extract(condition, arr):
+    """extract(condition, arr)
+
+    Returns the elements of an array that satisfy the condition.
+
+    If ``condition`` is boolean, ``dpctl.tensor.extract`` is
+    equivalent to ``arr[condition]``.
+
+    Note that ``dpctl.tensor.place`` does the opposite of
+    ``dpctl.tensor.extract``.
+
+    Args:
+        condition (usm_ndarray):
+            An array whose non-zero or ``True`` entries indicate the elements
+            of ``arr`` to extract.
+
+        arr (usm_ndarray):
+            Input array of the same size as ``condition``.
+
+    Returns:
+        usm_ndarray:
+            Rank 1 array of values from ``arr`` where ``condition`` is
+            ``True``.
+    """
+    if not isinstance(condition, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(condition)}"
+        )
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    exec_q = dpt.get_execution_queue(
+        (
+            condition.sycl_queue,
+            arr.sycl_queue,
+        )
+    )
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError
+    if condition.shape != arr.shape:
+        raise ValueError("Arrays are not of the same size")
+    return _extract_impl(arr, condition)
+
+
+def nonzero(arr):
+    """nonzero(arr)
+
+    Return the indices of non-zero elements.
+
+    Returns a tuple of usm_ndarrays, one for each dimension
+    of ``arr``, containing the indices of the non-zero elements
+    in that dimension. The values of ``arr`` are always tested in
+    row-major, C-style order.
+
+    Args:
+        arr (usm_ndarray):
+            Input array, which has non-zero array rank.
+
+    Returns:
+        Tuple[usm_ndarray, ...]:
+            Indices of non-zero array elements.
+    """
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    if arr.ndim == 0:
+        raise ValueError("Array of positive rank is expected")
+    return _nonzero_impl(arr)
+
+
+def place(arr, mask, vals):
+    """place(arr, mask, vals)
+
+    Change elements of an array based on conditional and input values.
+
+    If ``mask`` is boolean, ``dpctl.tensor.place`` is
+    equivalent to ``arr[mask] = vals``.
+
+    Args:
+        arr (usm_ndarray):
+            Array to put data into.
+        mask (usm_ndarray):
+            Boolean mask array. Must have the same size as ``arr``.
+        vals (usm_ndarray, sequence):
+            Values to put into ``arr``. Only the first N elements are
+            used, where N is the number of True values in ``mask``. If
+            ``vals`` is smaller than N, it will be repeated, and if
+            elements of ``arr`` are to be masked, this sequence must be
+            non-empty. Array ``vals`` must be one dimensional.
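The three functions defined so far cooperate as follows; a hedged sketch (note that this `place` implementation requires `vals` to be a one-dimensional `usm_ndarray`, and a single value is repeated to fill every masked position).

```python
import dpctl.tensor as dpt

x = dpt.asarray([[1, 0, 2], [0, 3, 0]], dtype="int32")
mask = dpt.not_equal(x, 0)

print(dpt.asnumpy(dpt.extract(mask, x)))     # [1 2 3], rank-1 result

rows, cols = dpt.nonzero(x)                  # one index array per dimension
print(dpt.asnumpy(rows), dpt.asnumpy(cols))  # [0 0 1] [0 2 1]

dpt.place(x, mask, dpt.asarray([-1], dtype="int32"))
print(dpt.asnumpy(x))                        # [[-1  0 -1], [ 0 -1  0]]
```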
+ """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if not isinstance(mask, dpt.usm_ndarray): + raise TypeError( + "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(vals)}" + ) + exec_q = dpt.get_execution_queue( + ( + arr.sycl_queue, + mask.sycl_queue, + vals.sycl_queue, + ) + ) + if exec_q is None: + raise dpt.ExecutionPlacementError + if arr.shape != mask.shape or vals.ndim != 1: + raise ValueError("Array sizes are not as required") + cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + _manager = SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + nz_count = ti.mask_positions( + mask, cumsum, sycl_queue=exec_q, depends=deps_ev + ) + if nz_count == 0: + return + if vals.size == 0: + raise ValueError("Cannot insert from an empty array!") + if vals.dtype == arr.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, arr.dtype) + hev, pl_ev = ti._place( + dst=arr, + cumsum=cumsum, + axis_start=0, + axis_end=mask.ndim, + rhs=rhs, + sycl_queue=exec_q, + ) + _manager.add_event_pair(hev, pl_ev) + + +def put(x, indices, vals, /, *, axis=None, mode="wrap"): + """put(x, indices, vals, axis=None, mode="wrap") + + Puts values into an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array the values will be put into. + indices (usm_ndarray): + One-dimensional array of indices. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the result shape + ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``. + axis (int, optional): + The axis along which the values will be placed. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handing + the duplicates to achieve deterministic behavior requires additional + work, e.g. + + :Example: + + .. 
+        .. code-block:: python
+
+            from dpctl import tensor as dpt
+
+            def put_vec_duplicates(vec, ind, vals):
+                "Put values into vec, handling possible duplicates in ind"
+                assert vec.ndim == 1 and ind.ndim == 1 and vals.ndim == 1
+
+                # find positions of last occurrences of each
+                # unique index
+                ind_flipped = dpt.flip(ind)
+                ind_uniq = dpt.unique_all(ind_flipped).indices
+                has_dups = len(ind) != len(ind_uniq)
+
+                if has_dups:
+                    ind_uniq = dpt.subtract(vec.size - 1, ind_uniq)
+                    ind = dpt.take(ind, ind_uniq)
+                    vals = dpt.take(vals, ind_uniq)
+
+                dpt.put(vec, ind, vals)
+
+            n = 512
+            ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1)))
+            x = dpt.zeros(ind.size, dtype="int32")
+            vals = dpt.arange(ind.size, dtype=x.dtype)
+
+            # Values corresponding to last positions of
+            # duplicate indices are written into the vector x
+            put_vec_duplicates(x, ind, vals)
+
+            parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype))
+            expected = dpt.concat(parts)
+            assert dpt.all(x == expected)
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
+        )
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            "`indices` expected `dpt.usm_ndarray`, got `{}`.".format(
+                type(indices)
+            )
+        )
+    if isinstance(vals, dpt.usm_ndarray):
+        queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type]
+    else:
+        queues_ = [x.sycl_queue, indices.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type]
+    if indices.ndim != 1:
+        raise ValueError(
+            "`indices` expected a 1D array, got `{}`".format(indices.ndim)
+        )
+    if indices.dtype.kind not in "ui":
+        raise IndexError(
+            "`indices` expected integer data type, got `{}`".format(
+                indices.dtype
+            )
+        )
+    exec_q = dpt.get_execution_queue(queues_)
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError
+    vals_usm_type = dpt.get_coerced_usm_type(usm_types_)
+
+    mode = _get_indexing_mode(mode)
+
+    x_ndim = x.ndim
+    if axis is None:
+        if x_ndim > 1:
+            raise ValueError(
+                "`axis` cannot be `None` for array of dimension `{}`".format(
+                    x_ndim
+                )
+            )
+        axis = 0
+
+    if x_ndim > 0:
+        axis = normalize_axis_index(operator.index(axis), x_ndim)
+        x_sh = x.shape
+        if x_sh[axis] == 0 and indices.size != 0:
+            raise IndexError("cannot take non-empty indices from an empty axis")
+        val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :]
+    else:
+        if axis != 0:
+            raise ValueError("`axis` must be 0 for an array of dimension 0.")
+        val_shape = indices.shape
+
+    if not isinstance(vals, dpt.usm_ndarray):
+        vals = dpt.asarray(
+            vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q
+        )
+    # choose to throw here for consistency with `place`
+    if vals.size == 0:
+        raise ValueError(
+            "cannot put into non-empty indices along an empty axis"
+        )
+    if vals.dtype == x.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, x.dtype)
+    rhs = dpt.broadcast_to(rhs, val_shape)
+
+    _manager = SequentialOrderManager[exec_q]
+    deps_ev = _manager.submitted_events
+    hev, put_ev = ti._put(
+        x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(hev, put_ev)
+
+
+def put_along_axis(x, indices, vals, /, *, axis=-1, mode="wrap"):
+    """
+    Puts elements into an array at the one-dimensional indices specified by
+    ``indices`` along a provided ``axis``.
+
+    Args:
+        x (usm_ndarray):
+            input array. Must be compatible with ``indices``, except for the
+            axis (dimension) specified by ``axis``.
+        indices (usm_ndarray):
+            array indices. Must have the same rank (i.e., number of
+            dimensions) as ``x``.
+        vals (usm_ndarray):
+            Array of values to be put into ``x``.
+            Must be broadcastable to the shape of ``indices``.
+        axis: int
+            axis along which to place values. If ``axis`` is negative, the
+            function determines the axis along which to place values by
+            counting from the last dimension. Default: ``-1``.
+        mode (str, optional):
+            How out-of-bounds indices will be handled. Possible values
+            are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    .. note::
+
+        If input array ``indices`` contains duplicates, a race condition
+        occurs, and the value written into corresponding positions in ``x``
+        may vary from run to run. Preserving sequential semantics in handling
+        the duplicates to achieve deterministic behavior requires additional
+        work.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpnp.tensor.usm_ndarray, got {type(indices)}"
+        )
+    x_nd = x.ndim
+    if x_nd != indices.ndim:
+        raise ValueError(
+            "Number of dimensions in the first and the second "
+            "argument arrays must be equal"
+        )
+    pp = normalize_axis_index(operator.index(axis), x_nd)
+    if isinstance(vals, dpt.usm_ndarray):
+        queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type]
+    else:
+        queues_ = [x.sycl_queue, indices.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type]
+    exec_q = dpt.get_execution_queue(queues_)
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments. "
+        )
+    out_usm_type = dpt.get_coerced_usm_type(usm_types_)
+    mode_i = _get_indexing_mode(mode)
+    indexes_dt = (
+        dpt.uint64
+        if indices.dtype == dpt.uint64
+        else ti.default_device_index_type(exec_q.sycl_device)
+    )
+    _ind = tuple(
+        (
+            indices
+            if i == pp
+            else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt)
+        )
+        for i in range(x_nd)
+    )
+    return _put_multi_index(x, _ind, 0, vals, mode=mode_i)
+
+
+def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
+    """take(x, indices, axis=None, out=None, mode="wrap")
+
+    Takes elements from an array along a given axis at given indices.
+
+    Args:
+        x (usm_ndarray):
+            The array that elements will be taken from.
+        indices (usm_ndarray):
+            One-dimensional array of indices.
+        axis (int, optional):
+            The axis along which the values will be selected.
+            If ``x`` is one-dimensional, this argument is optional.
+            Default: ``None``.
+        out (Optional[usm_ndarray]):
+            Output array to populate. Array must have the correct
+            shape and the expected data type.
+        mode (str, optional):
+            How out-of-bounds indices will be handled. Possible values
+            are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    Returns:
+        usm_ndarray:
+            Array with shape
+            ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]``
+            filled with elements from ``x``.
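+
+    :Example:
+
+        A minimal sketch of gathering from a one-dimensional array
+        (illustrative values):
+
+        .. code-block:: python
+
+            import dpnp.tensor as dpt
+
+            x = dpt.arange(10, dtype="i4")
+            ind = dpt.arange(0, 10, step=3, dtype="i8")
+            y = dpt.take(x, ind)
+            # y contains [0, 3, 6, 9]; with the default mode="wrap",
+            # an index of -1 would select the last element of x
+            assert dpt.all(y == x[ind])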
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpt.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpt.ExecutionPlacementError + res_usm_type = dpt.get_coerced_usm_type([x.usm_type, indices.usm_type]) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape + + dt = x.dtype + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + if dt != out.dtype: + raise ValueError( + f"Output array of type {dt} is needed, got {out.dtype}" + ) + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) + + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_ev) + out = orig_out + + return out + + +def take_along_axis(x, indices, /, *, axis=-1, mode="wrap"): + """ + Returns elements from an array at the one-dimensional indices specified + by ``indices`` along a provided ``axis``. + + Args: + x (usm_ndarray): + input array. Must be compatible with ``indices``, except for the + axis (dimension) specified by ``axis``. + indices (usm_ndarray): + array indices. Must have the same rank (i.e., number of dimensions) + as ``x``. + axis: int + axis along which to select values. If ``axis`` is negative, the + function determines the axis along which to select values by + counting from the last dimension. Default: ``-1``. + mode (str, optional): + How out-of-bounds indices will be handled. 
+            Possible values are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    Returns:
+        usm_ndarray:
+            an array having the same data type as ``x``. The returned array
+            has the same rank (i.e., number of dimensions) as ``x`` and a
+            shape determined according to broadcasting rules, except for the
+            axis (dimension) specified by ``axis`` whose size must equal the
+            size of the corresponding axis (dimension) in ``indices``.
+
+    Note:
+        Treatment of out-of-bounds indices in the ``indices`` array is
+        controlled by the value of the ``mode`` keyword.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpnp.tensor.usm_ndarray, got {type(indices)}"
+        )
+    x_nd = x.ndim
+    if x_nd != indices.ndim:
+        raise ValueError(
+            "Number of dimensions in the first and the second "
+            "argument arrays must be equal"
+        )
+    pp = normalize_axis_index(operator.index(axis), x_nd)
+    out_usm_type = dpt.get_coerced_usm_type((x.usm_type, indices.usm_type))
+    exec_q = dpt.get_execution_queue((x.sycl_queue, indices.sycl_queue))
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments. "
+        )
+    mode_i = _get_indexing_mode(mode)
+    indexes_dt = (
+        dpt.uint64
+        if indices.dtype == dpt.uint64
+        else ti.default_device_index_type(exec_q.sycl_device)
+    )
+    _ind = tuple(
+        (
+            indices
+            if i == pp
+            else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt)
+        )
+        for i in range(x_nd)
+    )
+    return _take_multi_index(x, _ind, 0, mode=mode_i)
diff --git a/dpnp/tensor/_linear_algebra_functions.py b/dpnp/tensor/_linear_algebra_functions.py
new file mode 100644
index 000000000000..dcaf99b4423c
--- /dev/null
+++ b/dpnp/tensor/_linear_algebra_functions.py
@@ -0,0 +1,1015 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as tei
+import dpnp.tensor._tensor_impl as ti
+import dpnp.tensor._tensor_linalg_impl as tli
+
+from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
+from ._manipulation_functions import _broadcast_shape_impl
+from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
+from ._type_utils import (
+    _acceptance_fn_default_binary,
+    _find_buf_dtype2,
+    _to_device_supported_dtype,
+)
+
+
+def matrix_transpose(x):
+    r"""matrix_transpose(x)
+
+    Transposes the innermost two dimensions of `x`, where `x` is a
+    2-dimensional matrix or a stack of 2-dimensional matrices.
+
+    To convert from a 1-dimensional array to a 2-dimensional column
+    vector, use `x[:, dpt.newaxis]`.
+
+    Args:
+        x (usm_ndarray):
+            Input array with shape (..., m, n).
+
+    Returns:
+        usm_ndarray:
+            Array with shape (..., n, m).
+    """
+
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
+        )
+    if x.ndim < 2:
+        raise ValueError(
+            "dpnp.tensor.matrix_transpose requires array to have "
+            "at least 2 dimensions"
+        )
+
+    return x.mT
+
+
+def tensordot(x1, x2, axes=2):
+    r"""tensordot(x1, x2, axes=2)
+
+    Returns a tensor contraction of `x1` and `x2` over specific axes.
+
+    Args:
+        x1 (usm_ndarray):
+            first input array, expected to have numeric data type.
+        x2 (usm_ndarray):
+            second input array, expected to have numeric data type.
+            Corresponding contracted axes of `x1` and `x2` must be equal.
+        axes (Union[int, Tuple[Sequence[int], Sequence[int]]]):
+            number of axes to contract or explicit sequences of axes for
+            `x1` and `x2`, respectively. If `axes` is an integer equal to `N`,
+            then the contraction is performed over the last `N` axes of `x1`
+            and the first `N` axes of `x2`, in order. The size of each
+            corresponding axis must match, and `N` must be non-negative.
+
+            * if `N` equals `0`, the result is the tensor outer product
+            * if `N` equals `1`, the result is the tensor dot product
+            * if `N` equals `2`, the result is the tensor double
+              contraction (default).
+
+            If `axes` is a tuple of two sequences `(x1_axes, x2_axes)`, the
+            first sequence applies to `x1` and the second sequence applies
+            to `x2`. Both sequences must have equal length, and each axis
+            `x1_axes[i]` for `x1` must have the same size as the respective
+            axis `x2_axes[i]` for `x2`. Each sequence must consist of unique
+            integers that specify valid axes for each respective array.
+            For example, if `x1` has rank `N`, a valid axis must reside on the
+            half-open interval `[-N, N)`.
+    Returns:
+        usm_ndarray:
+            an array containing the tensor contraction whose shape consists of
+            the non-contracted axes of the first array `x1`, followed by the
+            non-contracted axes of the second array `x2`.
The returned array + must have a data type determined by Type Promotion Rules. + """ + if not isinstance(x1, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}") + if not isinstance(x2, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}") + q1, x1_usm_type = x1.sycl_queue, x1.usm_type + q2, x2_usm_type = x2.sycl_queue, x2.usm_type + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x1_usm_type, + x2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + # handle axes and shapes validation + x1_nd = x1.ndim + x2_nd = x2.ndim + x1_shape = x1.shape + x2_shape = x2.shape + if isinstance(axes, int): + if axes < 0: + raise ValueError("`axes` integer is expected to be non-negative") + n_axes1 = axes + n_axes2 = axes + axes1 = normalize_axis_tuple(tuple(range(-axes, 0)), x1_nd) + axes2 = tuple(range(0, axes)) + elif isinstance(axes, tuple): + if len(axes) != 2: + raise ValueError( + "`axes` tuple is expected to contain two sequences" + ) + axes1 = tuple(axes[0]) + axes2 = tuple(axes[1]) + n_axes1 = len(axes1) + n_axes2 = len(axes2) + else: + raise TypeError("`axes` must be an integer or a tuple of sequences") + if n_axes1 != n_axes2: + raise ValueError( + "number of axes contracted must be the same for each array" + ) + if n_axes1 == 0: + arr1 = x1[..., dpt.newaxis] + arr2 = x2[dpt.newaxis, ...] + n_axes1 = 1 + n_axes2 = 1 + else: + same_shapes = True + for i in range(n_axes1): + axis1 = axes1[i] + axis2 = axes2[i] + same_shapes = same_shapes and (x1_shape[axis1] == x2_shape[axis2]) + if not same_shapes: + raise ValueError("shape mismatch in contracted `tensordot` axes") + axes1 = normalize_axis_tuple(axes1, x1_nd) + axes2 = normalize_axis_tuple(axes2, x2_nd) + perm1 = [i for i in range(x1_nd) if i not in axes1] + list(axes1) + perm2 = list(axes2) + [i for i in range(x2_nd) if i not in axes2] + arr1 = dpt.permute_dims(x1, perm1) + arr2 = dpt.permute_dims(x2, perm2) + arr1_outer_nd = arr1.ndim - n_axes1 + arr2_outer_nd = arr2.ndim - n_axes2 + res_shape = arr1.shape[:arr1_outer_nd] + arr2.shape[n_axes2:] + # dtype validation + sycl_dev = exec_q.sycl_device + x1_dtype = x1.dtype + x2_dtype = x2.dtype + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + x1_dtype, + x2_dtype, + tli._dot_result_type, + sycl_dev, + acceptance_fn=_acceptance_fn_default_binary, + ) + if res_dt is None: + raise TypeError( + "function 'tensordot' does not support input types " + f"({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." 
+ ) + + _manager = SequentialOrderManager[exec_q] + if buf1_dt is None and buf2_dt is None: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + dep_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( + x1=arr1, + x2=arr2, + batch_dims=0, + x1_outer_dims=arr1_outer_nd, + x2_outer_dims=arr2_outer_nd, + inner_dims=n_axes1, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + + return out + + elif buf1_dt is None: + buf2 = _empty_like_orderK(arr2, buf2_dt) + + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + ht_dot_ev, dot_ev = tli._dot( + x1=arr1, + x2=buf2, + batch_dims=0, + x1_outer_dims=arr1_outer_nd, + x2_outer_dims=arr2_outer_nd, + inner_dims=n_axes1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + + return out + + elif buf2_dt is None: + buf1 = _empty_like_orderK(arr1, buf1_dt) + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr1, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + ht_dot_ev, dot_ev = tli._dot( + x1=buf1, + x2=arr2, + batch_dims=0, + x1_outer_dims=arr1_outer_nd, + x2_outer_dims=arr2_outer_nd, + inner_dims=n_axes1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + + return out + + buf1 = _empty_like_orderK(arr1, buf1_dt) + deps_ev = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + buf2 = _empty_like_orderK(arr2, buf2_dt) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=buf2, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + ht_, dot_ev = tli._dot( + x1=buf1, + x2=buf2, + batch_dims=0, + x1_outer_dims=arr1_outer_nd, + x2_outer_dims=arr2_outer_nd, + inner_dims=n_axes1, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, dot_ev) + + return out + + +def vecdot(x1, x2, axis=-1): + r"""vecdot(x1, x2, axis=-1) + + Computes the (vector) dot product of two arrays. + + Args: + x1 (usm_ndarray): + first input array. + x2 (usm_ndarray): + second input array. Input arrays must have compatible + shapes along non-contract axes according to broadcasting + rules, and must have the same size along the contracted + axis. Input arrays should be of numeric type. + axis (Optional[int]): + axis over which to compute the dot product. The axis must + be an integer on the interval `[-N, -1]`, where `N` is + ``min(x1.ndim, x2.ndim)``. The axis along which dot product + is performed is counted backward from the last axes + (that is, `-1` refers to the last axis). By default, + dot product is computed over the last axis. + Default: `-1`. 
+ + Returns: + usm_ndarray: + if `x1` and `x2` are both one-dimensional arrays, a + zero-dimensional array containing the dot product value + is returned; otherwise, a non-zero-dimensional array containing + the dot products and having rank `N-1`, where `N` is the rank + of the shape of input arrays after broadcasting rules are applied + to non-contracted axes. + """ + if not isinstance(x1, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}") + if not isinstance(x2, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}") + q1, x1_usm_type = x1.sycl_queue, x1.usm_type + q2, x2_usm_type = x2.sycl_queue, x2.usm_type + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x1_usm_type, + x2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + # axis and shape validation + x1_nd = x1.ndim + x2_nd = x2.ndim + x1_shape = x1.shape + x2_shape = x2.shape + if axis >= 0: + raise ValueError("`axis` must be negative") + axis = operator.index(axis) + x1_axis = normalize_axis_index(axis, x1_nd) + x2_axis = normalize_axis_index(axis, x2_nd) + if x1_shape[x1_axis] != x2_shape[x2_axis]: + raise ValueError( + "given axis must have the same shape for `x1` and `x2`" + ) + if x1_nd > x2_nd: + x2_shape = (1,) * (x1_nd - x2_nd) + x2_shape + elif x2_nd > x1_nd: + x1_shape = (1,) * (x2_nd - x1_nd) + x1_shape + try: + broadcast_sh = _broadcast_shape_impl( + [ + x1_shape, + x2_shape, + ] + ) + except ValueError: + raise ValueError("mismatch in `vecdot` dimensions") + broadcast_nd = len(broadcast_sh) + contracted_axis = normalize_axis_index(axis, broadcast_nd) + res_sh = tuple( + [broadcast_sh[i] for i in range(broadcast_nd) if i != contracted_axis] + ) + # dtype validation + sycl_dev = exec_q.sycl_device + x1_dtype = x1.dtype + x2_dtype = x2.dtype + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + x1_dtype, + x2_dtype, + tli._dot_result_type, + sycl_dev, + acceptance_fn=_acceptance_fn_default_binary, + ) + if res_dt is None: + raise TypeError( + "function 'vecdot' does not support input types " + f"({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." 
+ ) + + _manager = SequentialOrderManager[exec_q] + if buf1_dt is None and buf2_dt is None: + if x1.dtype.kind == "c": + x1_tmp = _empty_like_orderK(x1, x1.dtype) + dep_evs = _manager.submitted_events + ht_conj_ev, conj_ev = tei._conj( + src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_conj_ev, conj_ev) + x1 = x1_tmp + if x1.shape != broadcast_sh: + x1 = dpt.broadcast_to(x1, broadcast_sh) + if x2.shape != broadcast_sh: + x2 = dpt.broadcast_to(x2, broadcast_sh) + x1 = dpt.moveaxis(x1, contracted_axis, -1) + x2 = dpt.moveaxis(x2, contracted_axis, -1) + out = dpt.empty( + res_sh, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + dep_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( + x1=x1, + x2=x2, + batch_dims=len(res_sh), + x1_outer_dims=0, + x2_outer_dims=0, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + return dpt.reshape(out, res_sh) + + elif buf1_dt is None: + if x1.dtype.kind == "c": + x1_tmp = _empty_like_orderK(x1, x1.dtype) + deps_ev = _manager.submitted_events + ht_conj_ev, conj_e = tei._conj( + src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_conj_ev, conj_e) + x1 = x1_tmp + buf2 = _empty_like_orderK(x2, buf2_dt) + deps_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if x1.shape != broadcast_sh: + x1 = dpt.broadcast_to(x1, broadcast_sh) + if buf2.shape != broadcast_sh: + buf2 = dpt.broadcast_to(buf2, broadcast_sh) + x1 = dpt.moveaxis(x1, contracted_axis, -1) + buf2 = dpt.moveaxis(buf2, contracted_axis, -1) + out = dpt.empty( + res_sh, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + ht_dot_ev, dot_ev = tli._dot( + x1=x1, + x2=buf2, + batch_dims=len(res_sh), + x1_outer_dims=0, + x2_outer_dims=0, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + return dpt.reshape(out, res_sh) + + elif buf2_dt is None: + buf1 = _empty_like_orderK(x1, buf1_dt) + deps_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if buf1.dtype.kind == "c": + ht_conj_ev, conj_ev = tei._conj( + src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy_ev] + ) + _manager.add_event_pair(ht_conj_ev, conj_ev) + if buf1.shape != broadcast_sh: + buf1 = dpt.broadcast_to(buf1, broadcast_sh) + if x2.shape != broadcast_sh: + x2 = dpt.broadcast_to(x2, broadcast_sh) + buf1 = dpt.moveaxis(buf1, contracted_axis, -1) + x2 = dpt.moveaxis(x2, contracted_axis, -1) + out = dpt.empty( + res_sh, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + deps_ev = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( + x1=buf1, + x2=x2, + batch_dims=len(res_sh), + x1_outer_dims=0, + x2_outer_dims=0, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=deps_ev, + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + return dpt.reshape(out, res_sh) + + buf1 = _empty_like_orderK(x1, buf1_dt) + deps_ev = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if buf1.dtype.kind == "c": + 
+        ht_conj_ev, conj_ev = tei._conj(
+            src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy1_ev]
+        )
+        _manager.add_event_pair(ht_conj_ev, conj_ev)
+    buf2 = _empty_like_orderK(x2, buf2_dt)
+    ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(ht_copy2_ev, copy2_ev)
+    if buf1.shape != broadcast_sh:
+        buf1 = dpt.broadcast_to(buf1, broadcast_sh)
+    if buf2.shape != broadcast_sh:
+        buf2 = dpt.broadcast_to(buf2, broadcast_sh)
+    buf1 = dpt.moveaxis(buf1, contracted_axis, -1)
+    buf2 = dpt.moveaxis(buf2, contracted_axis, -1)
+    out = dpt.empty(
+        res_sh,
+        dtype=res_dt,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+        order="C",
+    )
+    deps_ev = _manager.submitted_events
+    ht_dot_ev, dot_ev = tli._dot(
+        x1=buf1,
+        x2=buf2,
+        batch_dims=len(res_sh),
+        x1_outer_dims=0,
+        x2_outer_dims=0,
+        inner_dims=1,
+        dst=out,
+        sycl_queue=exec_q,
+        depends=deps_ev,
+    )
+    _manager.add_event_pair(ht_dot_ev, dot_ev)
+    return out
+
+
+def matmul(x1, x2, out=None, dtype=None, order="K"):
+    r"""matmul(x1, x2, out=None, dtype=None, order="K")
+
+    Computes the matrix product. Implements the same semantics
+    as the built-in operator `@`.
+
+    Args:
+        x1 (usm_ndarray):
+            first input array. Expected to have numeric data type, and
+            at least one dimension. If `x1` is one-dimensional having
+            shape `(M,)`, and `x2` has more than one dimension, `x1` is
+            effectively treated as a two-dimensional array with shape `(1, M)`,
+            although the prepended dimension is removed from the output array.
+            If `x1` has shape `(..., M, K)`, the innermost two dimensions form
+            matrices on which to perform matrix multiplication.
+        x2 (usm_ndarray):
+            second input array. Expected to have numeric data type, and
+            at least one dimension. If `x2` is one-dimensional having
+            shape `(N,)`, and `x1` has more than one dimension, `x2` is
+            effectively treated as a two-dimensional array with shape `(N, 1)`,
+            although the appended dimension is removed from the output array.
+            If `x2` has shape `(..., K, N)`, the innermost two dimensions form
+            matrices on which to perform matrix multiplication.
+        out (Optional[usm_ndarray]):
+            the array into which the result of the matrix product is written.
+            The data type of `out` must match the expected data type of the
+            result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the data type of the
+            returned array is determined by the Type Promotion Rules.
+            Default: `None`.
+        order (["K", "C", "F", "A"]):
+            memory layout of the output array, if `out` is `None`, otherwise
+            the `order` parameter value is not used. Default: `K`.
+    Returns:
+        usm_ndarray:
+            * if both `x1` and `x2` are one-dimensional arrays with shape
+              `(N,)`, returned array is a zero-dimensional array containing
+              inner product as its only element.
+            * if `x1` is two-dimensional array with shape `(M, K)` and `x2` is
+              a two-dimensional array with shape `(K, N)`, returned array is a
+              two-dimensional array with shape `(M, N)` and contains the
+              conventional matrix product.
+            * if `x1` is a one-dimensional array with shape `(K,)` and `x2` is
+              an array with shape `(..., K, N)`, returned array contains the
+              conventional matrix product and has shape `(..., N)`.
+            * if `x1` is an array with shape `(..., M, K)` and `x2` is a
+              one-dimensional array with shape `(K,)`, returned array has shape
+              `(..., M)` and contains the conventional matrix product.
+ * if `x1` is a two-dimensional array with shape `(M, K)` and `x2` + is an array with shape `(..., K, N)`, returned array contains + conventional matrix product for each stacked matrix and has shape + `(..., M, N)`. + * if `x1` has shape `(..., M, K)` and `x2` is a two-dimensional + array with shape `(K, N)`, returned array contains conventional + matrix product for each stacked matrix and has shape + `(..., M, N)`. + * if both `x1` and `x2` have more than two dimensions, returned + array contains conventional matrix product for each stacked + matrix and has shape determined by broadcasting rules for + `x1.shape[:-2]` and `x2.shape[:-2]`. + + The data type of the returned array is determined by the Type + Promotion Rules. If either `x1` or `x2` has a complex floating + point type, neither argument is complex conjugated or transposed. + """ + if not isinstance(x1, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}") + if not isinstance(x2, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}") + if order not in ["K", "C", "F", "A"]: + order = "K" + q1, x1_usm_type = x1.sycl_queue, x1.usm_type + q2, x2_usm_type = x2.sycl_queue, x2.usm_type + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x1_usm_type, + x2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + + x1_nd = x1.ndim + x2_nd = x2.ndim + if x1_nd == 0 or x2_nd == 0: + raise ValueError("one or more operands to `matmul` is 0 dimensional") + x1_shape = x1.shape + x2_shape = x2.shape + appended_axes = [] + if x1_nd == 1: + x1 = x1[dpt.newaxis, :] + x1_shape = x1.shape + appended_axes.append(-2) + if x2_nd == 1: + x2 = x2[:, dpt.newaxis] + x2_shape = x2.shape + appended_axes.append(-1) + if x1_shape[-1] != x2_shape[-2]: + raise ValueError("mismatch in `matmul` inner dimension") + x1_outer_sh = x1_shape[:-2] + x2_outer_sh = x2_shape[:-2] + try: + res_outer_sh = _broadcast_shape_impl( + [ + x1_outer_sh, + x2_outer_sh, + ] + ) + except ValueError: + raise ValueError("mismatch in `matmul` batching dimensions") + x1_broadcast_shape = res_outer_sh + x1_shape[-2:] + x2_broadcast_shape = res_outer_sh + x2_shape[-2:] + res_shape = res_outer_sh + x1_shape[-2:-1] + x2_shape[-1:] + + sycl_dev = exec_q.sycl_device + x1_dtype = x1.dtype + x2_dtype = x2.dtype + if dtype is None: + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + x1_dtype, + x2_dtype, + tli._dot_result_type, + sycl_dev, + acceptance_fn=_acceptance_fn_default_binary, + ) + if res_dt is None: + raise ValueError( + "function 'matmul' does not support input types " + f"({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, sycl_dev) + buf1_dt, buf2_dt = None, None + if x1_dtype != res_dt: + if dpt.can_cast(x1_dtype, res_dt, casting="same_kind"): + buf1_dt = res_dt + else: + raise ValueError( + r"`matmul` input `x1` cannot be cast from " + f"{x1_dtype} to " + f"requested type {res_dt} according to the casting rule " + "''same_kind''." 
+ ) + if x2_dtype != res_dt: + if dpt.can_cast(x2_dtype, res_dt, casting="same_kind"): + buf2_dt = res_dt + else: + raise ValueError( + r"`matmul` input `x2` cannot be cast from " + f"{x2_dtype} to " + f"requested type {res_dt} according to the casting rule " + "''same_kind''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + final_res_shape = tuple( + res_shape[i] + for i in range(-len(res_shape), 0) + if i not in appended_axes + ) + if out.shape != final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + + if appended_axes: + out = dpt.expand_dims(out, axis=appended_axes) + orig_out = out + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x1, out) and buf1_dt is None: + out = dpt.empty_like(out) + + if ti._array_overlap(x2, out) and buf2_dt is None: + # should not reach if out is reallocated + # after being checked against x1 + out = dpt.empty_like(out) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x1, + x2, + ) + ) + else "C" + ) + + _manager = SequentialOrderManager[exec_q] + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x1, x2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x1.shape != x1_broadcast_shape: + x1 = dpt.broadcast_to(x1, x1_broadcast_shape) + if x2.shape != x2_broadcast_shape: + x2 = dpt.broadcast_to(x2, x2_broadcast_shape) + deps_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( + x1=x1, + x2=x2, + batch_dims=len(res_shape[:-2]), + x1_outer_dims=1, + x2_outer_dims=1, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=deps_evs, + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[dot_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + if appended_axes: + out = dpt.squeeze(out, tuple(appended_axes)) + return out + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(x2, buf2_dt) + else: + buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order) + deps_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x1.shape != x1_broadcast_shape: + x1 = dpt.broadcast_to(x1, x1_broadcast_shape) + if buf2.shape != x2_broadcast_shape: + buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape) + ht_dot_ev, dot_ev = 
tli._dot( + x1=x1, + x2=buf2, + batch_dims=len(res_shape[:-2]), + x1_outer_dims=1, + x2_outer_dims=1, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[dot_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + if appended_axes: + out = dpt.squeeze(out, tuple(appended_axes)) + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(x1, buf1_dt) + else: + buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order) + deps_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, x2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if buf1.shape != x1_broadcast_shape: + buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape) + if x2.shape != x2_broadcast_shape: + x2 = dpt.broadcast_to(x2, x2_broadcast_shape) + ht_dot_ev, dot_ev = tli._dot( + x1=buf1, + x2=x2, + batch_dims=len(res_shape[:-2]), + x1_outer_dims=1, + x2_outer_dims=1, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[dot_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + if appended_axes: + out = dpt.squeeze(out, tuple(appended_axes)) + return out + + if order == "K": + if x1.flags.c_contiguous and x2.flags.c_contiguous: + order = "C" + elif x1.flags.f_contiguous and x2.flags.f_contiguous: + order = "F" + if order == "K": + buf1 = _empty_like_orderK(x1, buf1_dt) + else: + buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order) + deps_ev = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if order == "K": + buf2 = _empty_like_orderK(x2, buf2_dt) + else: + buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if buf1.shape != x1_broadcast_shape: + buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape) + if buf2.shape != x2_broadcast_shape: + buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape) + ht_, dot_ev = tli._dot( + x1=buf1, + x2=buf2, + batch_dims=len(res_shape[:-2]), + x1_outer_dims=1, + x2_outer_dims=1, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, dot_ev) + if appended_axes: + out = dpt.squeeze(out, 
tuple(appended_axes)) + return out diff --git a/dpnp/tensor/_manipulation_functions.py b/dpnp/tensor/_manipulation_functions.py new file mode 100644 index 000000000000..7347f62de115 --- /dev/null +++ b/dpnp/tensor/_manipulation_functions.py @@ -0,0 +1,1094 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+import itertools
+import operator
+
+import numpy as np
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+
+from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
+from ._type_utils import _supported_dtype, _to_device_supported_dtype
+
+__doc__ = (
+    "Implementation module for array manipulation "
+    "functions in :module:`dpnp.tensor`"
+)
+
+
+def _arrays_validation(arrays, check_ndim=True):
+    n = len(arrays)
+    if n == 0:
+        raise TypeError("Missing 1 required positional argument: 'arrays'.")
+
+    if not isinstance(arrays, (list, tuple)):
+        raise TypeError(f"Expected tuple or list type, got {type(arrays)}.")
+
+    for X in arrays:
+        if not isinstance(X, dpt.usm_ndarray):
+            raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    exec_q = dpt.get_execution_queue([X.sycl_queue for X in arrays])
+    if exec_q is None:
+        raise ValueError("All the input arrays must have the same sycl queue.")
+
+    res_usm_type = dpt.get_coerced_usm_type([X.usm_type for X in arrays])
+    if res_usm_type is None:
+        raise ValueError("All the input arrays must have a valid usm_type.")
+
+    X0 = arrays[0]
+    _supported_dtype(Xi.dtype for Xi in arrays)
+
+    res_dtype = X0.dtype
+    dev = exec_q.sycl_device
+    for i in range(1, n):
+        res_dtype = np.promote_types(res_dtype, arrays[i].dtype)
+    res_dtype = _to_device_supported_dtype(res_dtype, dev)
+
+    if check_ndim:
+        for i in range(1, n):
+            if X0.ndim != arrays[i].ndim:
+                raise ValueError(
+                    "All the input arrays must have the same number of "
+                    f"dimensions, but the array at index 0 has {X0.ndim} "
+                    f"dimension(s) and the array at index {i} has "
+                    f"{arrays[i].ndim} dimension(s)."
+                )
+    return res_dtype, res_usm_type, exec_q
+
+
+def _broadcast_shapes(*args):
+    """
+    Broadcast the input shapes into a single shape;
+    returns the broadcasted shape as a tuple.
+    """
+    array_shapes = [array.shape for array in args]
+    return _broadcast_shape_impl(array_shapes)
+
+
+def _broadcast_shape_impl(shapes):
+    if len(set(shapes)) == 1:
+        return shapes[0]
+    mutable_shapes = False
+    nds = [len(s) for s in shapes]
+    biggest = max(nds)
+    sh_len = len(shapes)
+    for i in range(sh_len):
+        diff = biggest - nds[i]
+        if diff > 0:
+            ty = type(shapes[i])
+            shapes[i] = ty(
+                itertools.chain(itertools.repeat(1, diff), shapes[i])
+            )
+    common_shape = []
+    for axis in range(biggest):
+        lengths = [s[axis] for s in shapes]
+        unique = set(lengths + [1])
+        if len(unique) > 2:
+            raise ValueError(
+                "Shape mismatch: two or more arrays have "
+                f"incompatible dimensions on axis {axis}"
+            )
+        elif len(unique) == 2:
+            unique.remove(1)
+            new_length = unique.pop()
+            common_shape.append(new_length)
+            for i in range(sh_len):
+                if shapes[i][axis] == 1:
+                    if not mutable_shapes:
+                        shapes = [list(s) for s in shapes]
+                        mutable_shapes = True
+                    shapes[i][axis] = new_length
+        else:
+            common_shape.append(1)
+
+    return tuple(common_shape)
+
+
+def _broadcast_strides(X_shape, X_strides, res_ndim):
+    """
+    Broadcasts strides to match the given dimensions;
+    returns the strides as a tuple.
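+
+    For example, broadcasting shape ``(3,)`` with strides ``(1,)`` to
+    ``res_ndim=2`` yields ``(0, 1)``: prepended axes, and axes of
+    size 1, are given stride 0, so every broadcast position aliases
+    the same underlying element.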
+ """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _check_same_shapes(X0_shape, axis, n, arrays): + for i in range(1, n): + Xi_shape = arrays[i].shape + for j, X0j in enumerate(X0_shape): + if X0j != Xi_shape[j] and j != axis: + raise ValueError( + "All the input array dimensions for the concatenation " + f"axis must match exactly, but along dimension {j}, the " + f"array at index 0 has size {X0j} and the array " + f"at index {i} has size {Xi_shape[j]}." + ) + + +def _concat_axis_None(arrays): + """Implementation of concat(arrays, axis=None).""" + res_dtype, res_usm_type, exec_q = _arrays_validation( + arrays, check_ndim=False + ) + res_shape = 0 + for array in arrays: + res_shape += array.size + res = dpt.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + fill_start = 0 + _manager = SequentialOrderManager[exec_q] + deps = _manager.submitted_events + for array in arrays: + fill_end = fill_start + array.size + if array.flags.c_contiguous: + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=dpt.reshape(array, -1), + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + else: + src_ = array + # _copy_usm_ndarray_for_reshape requires src and dst to have + # the same data type + if not array.dtype == res_dtype: + src2_ = dpt.empty_like(src_, dtype=res_dtype) + ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src_, dst=src2_, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_copy_ev, cpy_ev) + hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape( + src=src2_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=[cpy_ev], + ) + _manager.add_event_pair(hev, reshape_copy_ev) + else: + hev, cpy_ev = ti._copy_usm_ndarray_for_reshape( + src=src_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + fill_start = fill_end + + return res + + +def broadcast_arrays(*args): + """broadcast_arrays(*arrays) + + Broadcasts one or more :class:`dpctl.tensor.usm_ndarrays` against + one another. + + Args: + arrays (usm_ndarray): an arbitrary number of arrays to be + broadcasted. + + Returns: + List[usm_ndarray]: + A list of broadcasted arrays. Each array + must have the same shape. Each array must have the same `dtype`, + `device` and `usm_type` attributes as its corresponding input + array. + """ + if len(args) == 0: + raise ValueError("`broadcast_arrays` requires at least one argument") + for X in args: + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + shape = _broadcast_shapes(*args) + + if all(X.shape == shape for X in args): + return args + + return [broadcast_to(X, shape) for X in args] + + +def broadcast_to(X, /, shape): + """broadcast_to(x, shape) + + Broadcast an array to a new `shape`; returns the broadcasted + :class:`dpctl.tensor.usm_ndarray` as a view. + + Args: + x (usm_ndarray): input array + shape (Tuple[int,...]): array shape. The `shape` must be + compatible with `x` according to broadcasting rules. + + Returns: + usm_ndarray: + An array with the specified `shape`. + The output array is a view of the input array, and + hence has the same data type, USM allocation type and + device attributes. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + # Use numpy.broadcast_to to check the validity of the input + # parameter 'shape'. Raise ValueError if 'X' is not compatible + # with 'shape' according to NumPy's broadcasting rules. + new_array = np.broadcast_to( + np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape + ) + new_sts = _broadcast_strides(X.shape, X.strides, new_array.ndim) + return dpt.usm_ndarray( + shape=new_array.shape, + dtype=X.dtype, + buffer=X, + strides=new_sts, + offset=X._element_offset, + ) + + +def concat(arrays, /, *, axis=0): + """concat(arrays, axis) + + Joins a sequence of arrays along an existing axis. + + Args: + arrays (Union[List[usm_ndarray, Tuple[usm_ndarray,...]]]): + input arrays to join. The arrays must have the same shape, + except in the dimension specified by `axis`. + axis (Optional[int]): axis along which the arrays will be joined. + If `axis` is `None`, arrays must be flattened before + concatenation. If `axis` is negative, it is understood as + being counted from the last dimension. Default: `0`. + + Returns: + usm_ndarray: + An output array containing the concatenated + values. The output array data type is determined by Type + Promotion Rules of array API. + + All input arrays must have the same device attribute. The output array + is allocated on that same device, and data movement operations are + scheduled on a queue underlying the device. The USM allocation type + of the output array is determined by USM allocation type promotion + rules. + """ + if axis is None: + return _concat_axis_None(arrays) + + res_dtype, res_usm_type, exec_q = _arrays_validation(arrays) + n = len(arrays) + X0 = arrays[0] + + axis = normalize_axis_index(axis, X0.ndim) + X0_shape = X0.shape + _check_same_shapes(X0_shape, axis, n, arrays) + + res_shape_axis = 0 + for X in arrays: + res_shape_axis = res_shape_axis + X.shape[axis] + + res_shape = tuple( + X0_shape[i] if i != axis else res_shape_axis for i in range(X0.ndim) + ) + + res = dpt.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + deps = _manager.submitted_events + fill_start = 0 + for i in range(n): + fill_end = fill_start + arrays[i].shape[axis] + c_shapes_copy = tuple( + np.s_[fill_start:fill_end] if j == axis else np.s_[:] + for j in range(X0.ndim) + ) + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], + dst=res[c_shapes_copy], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + fill_start = fill_end + + return res + + +def expand_dims(X, /, *, axis=0): + """expand_dims(x, axis) + + Expands the shape of an array by inserting a new axis (dimension) + of size one at the position specified by axis. + + Args: + x (usm_ndarray): + input array + axis (Union[int, Tuple[int]]): + axis position in the expanded axes (zero-based). If `x` has rank + (i.e, number of dimensions) `N`, a valid `axis` must reside + in the closed-interval `[-N-1, N]`. If provided a negative + `axis`, the `axis` position at which to insert a singleton + dimension is computed as `N + axis + 1`. Hence, if + provided `-1`, the resolved axis position is `N` (i.e., + a singleton dimension must be appended to the input array `x`). + If provided `-N-1`, the resolved axis position is `0` (i.e., a + singleton dimension is prepended to the input array `x`). 
+
+    Returns:
+        usm_ndarray:
+            Returns a view, if possible, and a copy otherwise, with the
+            number of dimensions increased.
+            The expanded array has the same data type as the input array `x`.
+            The expanded array is located on the same device as the input
+            array, and has the same USM allocation type.
+
+    Raises:
+        IndexError: if `axis` value is invalid.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    if type(axis) not in (tuple, list):
+        axis = (axis,)
+
+    out_ndim = len(axis) + X.ndim
+    axis = normalize_axis_tuple(axis, out_ndim)
+
+    shape_it = iter(X.shape)
+    shape = tuple(1 if ax in axis else next(shape_it) for ax in range(out_ndim))
+
+    return dpt.reshape(X, shape)
+
+
+def flip(X, /, *, axis=None):
+    """flip(x, axis)
+
+    Reverses the order of elements in an array `x` along the given `axis`.
+    The shape of the array is preserved, but the elements are reordered.
+
+    Args:
+        x (usm_ndarray): input array.
+        axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along
+            which to flip.
+            If `axis` is `None`, all input array axes are flipped.
+            If `axis` is negative, the flipped axis is counted from the
+            last dimension. If provided more than one axis, only the specified
+            axes are flipped. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            A view of `x` with the entries of `axis` reversed.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    X_ndim = X.ndim
+    if axis is None:
+        indexer = (np.s_[::-1],) * X_ndim
+    else:
+        axis = normalize_axis_tuple(axis, X_ndim)
+        indexer = tuple(
+            np.s_[::-1] if i in axis else np.s_[:] for i in range(X.ndim)
+        )
+    return X[indexer]
+
+
+def moveaxis(X, source, destination, /):
+    """moveaxis(x, source, destination)
+
+    Moves axes of an array to new positions.
+
+    Args:
+        x (usm_ndarray): input array
+
+        source (int or a sequence of int):
+            Original positions of the axes to move.
+            These must be unique. If `x` has rank (i.e., number of
+            dimensions) `N`, a valid `axis` must be in the
+            half-open interval `[-N, N)`.
+
+        destination (int or a sequence of int):
+            Destination positions for each of the original axes.
+            These must also be unique. If `x` has rank
+            (i.e., number of dimensions) `N`, a valid `axis` must be
+            in the half-open interval `[-N, N)`.
+
+    Returns:
+        usm_ndarray:
+            Array with moved axes.
+            The returned array has the same data type as `x`,
+            is created on the same device as `x` and has the same
+            USM allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
+        ValueError: if `source` and `destination` do not have the same
+            number of elements.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    source = normalize_axis_tuple(source, X.ndim, "source")
+    destination = normalize_axis_tuple(destination, X.ndim, "destination")
+
+    if len(source) != len(destination):
+        raise ValueError(
+            "`source` and `destination` arguments must have "
+            "the same number of elements"
+        )
+
+    ind = [n for n in range(X.ndim) if n not in source]
+
+    for src, dst in sorted(zip(destination, source)):
+        ind.insert(src, dst)
+
+    return dpt.permute_dims(X, tuple(ind))
+
+
+def permute_dims(X, /, axes):
+    """permute_dims(x, axes)
+
+    Permute the axes (dimensions) of an array; returns the permuted
+    array as a view.
+
+    Args:
+        x (usm_ndarray): input array.
+        axes (Tuple[int, ...]): tuple containing permutation of
+            `(0,1,...,N-1)` where `N` is the number of axes (dimensions)
+            of `x`.
+    Returns:
+        usm_ndarray:
+            An array with permuted axes.
+            The returned array has the same data type as `x`,
+            is created on the same device as `x` and has the same USM
+            allocation type as `x`.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    axes = normalize_axis_tuple(axes, X.ndim, "axes")
+    if not X.ndim == len(axes):
+        raise ValueError(
+            "The length of the passed axes does not match "
+            "the number of usm_ndarray dimensions."
+        )
+    newstrides = tuple(X.strides[i] for i in axes)
+    newshape = tuple(X.shape[i] for i in axes)
+    return dpt.usm_ndarray(
+        shape=newshape,
+        dtype=X.dtype,
+        buffer=X,
+        strides=newstrides,
+        offset=X._element_offset,
+    )
+
+
+def repeat(x, repeats, /, *, axis=None):
+    """repeat(x, repeats, axis=None)
+
+    Repeat elements of an array on a per-element basis.
+
+    Args:
+        x (usm_ndarray): input array
+
+        repeats (Union[int, Sequence[int, ...], usm_ndarray]):
+            The number of repetitions for each element.
+
+            `repeats` must be broadcast-compatible with `N` where `N` is
+            `prod(x.shape)` if `axis` is `None` and `x.shape[axis]`
+            otherwise.
+
+            If `repeats` is an array, it must have an integer data type.
+            Otherwise, `repeats` must be a Python integer or sequence of
+            Python integers (i.e., a tuple, list, or range).
+
+        axis (Optional[int]):
+            The axis along which to repeat values. If `axis` is `None`, the
+            function repeats elements of the flattened array. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            output array with repeated elements.
+
+            If `axis` is `None`, the returned array is one-dimensional,
+            otherwise, it has the same shape as `x`, except for the axis along
+            which elements were repeated.
+
+            The returned array will have the same data type as `x`.
+            The returned array will be located on the same device as `x` and
+            have the same USM allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
+
+    x_ndim = x.ndim
+    x_shape = x.shape
+    if axis is not None:
+        axis = normalize_axis_index(operator.index(axis), x_ndim)
+        axis_size = x_shape[axis]
+    else:
+        axis_size = x.size
+
+    scalar = False
+    if isinstance(repeats, int):
+        if repeats < 0:
+            raise ValueError("`repeats` must be a non-negative integer")
+        usm_type = x.usm_type
+        exec_q = x.sycl_queue
+        scalar = True
+    elif isinstance(repeats, dpt.usm_ndarray):
+        if repeats.ndim > 1:
+            raise ValueError(
+                "`repeats` array must be 0- or 1-dimensional, got "
+                f"{repeats.ndim}"
+            )
+        exec_q = dpt.get_execution_queue((x.sycl_queue, repeats.sycl_queue))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "Execution placement cannot be unambiguously inferred "
+                "from input arguments."
+            )
+        usm_type = dpt.get_coerced_usm_type(
+            (
+                x.usm_type,
+                repeats.usm_type,
+            )
+        )
+        dpt.validate_usm_type(usm_type, allow_none=False)
+        if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
+            raise TypeError(
+                f"'repeats' data type {repeats.dtype} cannot be cast to "
+                "'int64' according to the casting rule 'same_kind'."
+            )
+        if repeats.size == 1:
+            scalar = True
+            # bring the single element to the host
+            if repeats.ndim == 0:
+                repeats = int(repeats)
+            else:
+                # Get the single element explicitly
+                # since non-0D arrays can not be converted to scalars
+                repeats = int(repeats[0])
+            if repeats < 0:
+                raise ValueError("`repeats` elements must be non-negative")
+        else:
+            if repeats.size != axis_size:
+                raise ValueError(
+                    "'repeats' array must be broadcastable to the size of "
+                    "the repeated axis"
+                )
+            if not dpt.all(repeats >= 0):
+                raise ValueError("'repeats' elements must be non-negative")
+
+    elif isinstance(repeats, (tuple, list, range)):
+        usm_type = x.usm_type
+        exec_q = x.sycl_queue
+
+        len_reps = len(repeats)
+        if len_reps == 1:
+            repeats = repeats[0]
+            if repeats < 0:
+                raise ValueError("`repeats` elements must be non-negative")
+            scalar = True
+        else:
+            if len_reps != axis_size:
+                raise ValueError(
+                    "`repeats` sequence must have the same length as the "
+                    "repeated axis"
+                )
+            repeats = dpt.asarray(
+                repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q
+            )
+            if not dpt.all(repeats >= 0):
+                raise ValueError("`repeats` elements must be non-negative")
+    else:
+        raise TypeError(
+            "Expected int, sequence, or `usm_ndarray` for second argument, "
+            f"got {type(repeats)}"
+        )
+
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if scalar:
+        res_axis_size = repeats * axis_size
+        if axis is not None:
+            res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+        else:
+            res_shape = (res_axis_size,)
+        res = dpt.empty(
+            res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q
+        )
+        if res_axis_size > 0:
+            ht_rep_ev, rep_ev = ti._repeat_by_scalar(
+                src=x,
+                dst=res,
+                reps=repeats,
+                axis=axis,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_rep_ev, rep_ev)
+    else:
+        if repeats.dtype != dpt.int64:
+            rep_buf = dpt.empty(
+                repeats.shape,
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+            cumsum = dpt.empty(
+                (axis_size,),
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            # _cumsum_1d synchronizes so `depends` ends here safely
+            res_axis_size = ti._cumsum_1d(
+                rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev]
+            )
+            if axis is not None:
+                res_shape = (
+                    x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+                )
+            else:
+                res_shape = (res_axis_size,)
+            res = dpt.empty(
+                res_shape,
+                dtype=x.dtype,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            if res_axis_size > 0:
+                ht_rep_ev, rep_ev = ti._repeat_by_sequence(
+                    src=x,
+                    dst=res,
+                    reps=rep_buf,
+                    cumsum=cumsum,
+                    axis=axis,
+                    sycl_queue=exec_q,
+                )
+                _manager.add_event_pair(ht_rep_ev, rep_ev)
+        else:
+            cumsum = dpt.empty(
+                (axis_size,),
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            res_axis_size = ti._cumsum_1d(
+                repeats, cumsum, sycl_queue=exec_q, depends=dep_evs
+            )
+            if axis is not None:
+                res_shape = (
+                    x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+                )
+            else:
+                res_shape = (res_axis_size,)
+            res = dpt.empty(
+                res_shape,
+                dtype=x.dtype,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            if res_axis_size > 0:
+                ht_rep_ev, rep_ev = ti._repeat_by_sequence(
+                    src=x,
+                    dst=res,
+                    reps=repeats,
+                    cumsum=cumsum,
+                    axis=axis,
+                    sycl_queue=exec_q,
+                )
+                _manager.add_event_pair(ht_rep_ev, rep_ev)
+    return res
+
+
+def roll(x, /, shift, *, axis=None):
+    """
+    roll(x, shift, axis)
+
+    Rolls array elements along a specified axis.
+    Array elements that roll beyond the last position are re-introduced
+    at the first position. Array elements that roll beyond the first position
+    are re-introduced at the last position.
+
+    Args:
+        x (usm_ndarray): input array
+        shift (Union[int, Tuple[int,...]]): number of places by which the
+            elements are shifted. If `shift` is a tuple, then `axis` must be a
+            tuple of the same size, and each of the given axes must be shifted
+            by the corresponding element in `shift`. If `shift` is an `int`
+            and `axis` a tuple, then the same `shift` must be used for all
+            specified axes. If a `shift` is positive, then array elements are
+            shifted positively (toward larger indices) along the dimension of
+            `axis`.
+            If a `shift` is negative, then array elements are shifted
+            negatively (toward smaller indices) along the dimension of `axis`.
+        axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along
+            which elements are shifted. If `axis` is `None`, the array is
+            flattened, shifted, and then restored to its original shape.
+            Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            An array having the same `dtype`, `usm_type` and
+            `device` attributes as `x` and whose elements are shifted relative
+            to `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
+    exec_q = x.sycl_queue
+    _manager = SequentialOrderManager[exec_q]
+    if axis is None:
+        shift = operator.index(shift)
+        res = dpt.empty(
+            x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
+        )
+        sz = operator.index(x.size)
+        shift = (shift % sz) if sz > 0 else 0
+        dep_evs = _manager.submitted_events
+        hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d(
+            src=x,
+            dst=res,
+            shift=shift,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(hev, roll_ev)
+        return res
+    axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True)
+    broadcasted = np.broadcast(shift, axis)
+    if broadcasted.ndim > 1:
+        raise ValueError("'shift' and 'axis' should be scalars or 1D sequences")
+    shifts = [
+        0,
+    ] * x.ndim
+    shape = x.shape
+    for sh, ax in broadcasted:
+        n_i = operator.index(shape[ax])
+        shifted = shifts[ax] + operator.index(sh)
+        shifts[ax] = (shifted % n_i) if n_i > 0 else 0
+    res = dpt.empty(
+        x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
+    )
+    dep_evs = _manager.submitted_events
+    ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd(
+        src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs
+    )
+    _manager.add_event_pair(ht_e, roll_ev)
+    return res
+
+
+def squeeze(X, /, axis=None):
+    """squeeze(x, axis)
+
+    Removes singleton dimensions (axes) from array `x`.
+
+    Args:
+        x (usm_ndarray): input array
+        axis (Union[int, Tuple[int,...]]): axis (or axes) to squeeze.
+
+    Returns:
+        usm_ndarray:
+            Output array is a view, if possible,
+            and a copy otherwise, but with all or a subset of the
+            dimensions of length 1 removed. Output has the same data
+            type as the input, is allocated on the same device as the
+            input and has the same USM allocation type as the input
+            array `x`.
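+            For example, squeezing an array of shape `(1, 3, 1)` with
+            `axis=0` yields shape `(3, 1)`, while `axis=None` removes
+            all singleton dimensions and yields shape `(3,)`.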
+
+    Raises:
+        ValueError: if the specified axis has a size greater than one.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    X_shape = X.shape
+    if axis is not None:
+        axis = normalize_axis_tuple(axis, X.ndim if X.ndim != 0 else X.ndim + 1)
+        new_shape = []
+        for i, x in enumerate(X_shape):
+            if i not in axis:
+                new_shape.append(x)
+            else:
+                if x != 1:
+                    raise ValueError(
+                        "Cannot select an axis to squeeze out "
+                        "which has size not equal to one."
+                    )
+        new_shape = tuple(new_shape)
+    else:
+        new_shape = tuple(axis for axis in X_shape if axis != 1)
+    if new_shape == X.shape:
+        return X
+    else:
+        return dpt.reshape(X, new_shape)
+
+
+def stack(arrays, /, *, axis=0):
+    """
+    stack(arrays, axis)
+
+    Joins a sequence of arrays along a new axis.
+
+    Args:
+        arrays (Union[List[usm_ndarray], Tuple[usm_ndarray,...]]):
+            input arrays to join. Each array must have the same shape.
+        axis (int): axis along which the arrays will be joined. Providing
+            an `axis` specifies the index of the new axis in the dimensions
+            of the output array. A valid axis must be in the interval
+            `[-N, N)`, where `N` is the rank (number of dimensions) of `x`.
+            Default: `0`.
+
+    Returns:
+        usm_ndarray:
+            An output array having rank `N+1`, where `N` is
+            the rank (number of dimensions) of `x`. If the input arrays have
+            different data types, array API Type Promotion Rules apply.
+
+    Raises:
+        ValueError: if not all input arrays have the same shape.
+        IndexError: if provided an `axis` outside of the required interval.
+    """
+    res_dtype, res_usm_type, exec_q = _arrays_validation(arrays)
+
+    n = len(arrays)
+    X0 = arrays[0]
+    res_ndim = X0.ndim + 1
+    axis = normalize_axis_index(axis, res_ndim)
+    X0_shape = X0.shape
+
+    for i in range(1, n):
+        if X0_shape != arrays[i].shape:
+            raise ValueError("All input arrays must have the same shape")
+
+    res_shape = tuple(
+        X0_shape[i - 1 * (i >= axis)] if i != axis else n
+        for i in range(res_ndim)
+    )
+
+    res = dpt.empty(
+        res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    )
+
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    for i in range(n):
+        c_shapes_copy = tuple(
+            i if j == axis else np.s_[:] for j in range(res_ndim)
+        )
+        _dst = res[c_shapes_copy]
+        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arrays[i], dst=_dst, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, cpy_ev)
+
+    return res
+
+
+def swapaxes(X, axis1, axis2):
+    """swapaxes(x, axis1, axis2)
+
+    Interchanges two axes of an array.
+
+    Args:
+        x (usm_ndarray): input array
+
+        axis1 (int): First axis.
+            If `x` has rank (i.e., number of dimensions) `N`,
+            a valid `axis` must be in the half-open interval `[-N, N)`.
+
+        axis2 (int): Second axis.
+            If `x` has rank (i.e., number of dimensions) `N`,
+            a valid `axis` must be in the half-open interval `[-N, N)`.
+
+    Returns:
+        usm_ndarray:
+            Array with swapped axes.
+            The returned array has the same data type as `x`,
+            is created on the same device as `x` and has the same USM
+            allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
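+
+    Example:
+        A minimal sketch (assuming the usual array constructors are
+        available in this namespace)::
+
+            x = dpt.zeros((2, 3, 4))
+            y = dpt.swapaxes(x, 0, 2)  # view with shape (4, 3, 2)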
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + axis1 = normalize_axis_index(axis1, X.ndim, "axis1") + axis2 = normalize_axis_index(axis2, X.ndim, "axis2") + + ind = list(range(0, X.ndim)) + ind[axis1] = axis2 + ind[axis2] = axis1 + return dpt.permute_dims(X, tuple(ind)) + + +def unstack(X, /, *, axis=0): + """unstack(x, axis=0) + + Splits an array in a sequence of arrays along the given axis. + + Args: + x (usm_ndarray): input array + + axis (int, optional): axis along which `x` is unstacked. + If `x` has rank (i.e, number of dimensions) `N`, + a valid `axis` must reside in the half-open interval `[-N, N)`. + Default: `0`. + + Returns: + Tuple[usm_ndarray,...]: + Output sequence of arrays which are views into the input array. + + Raises: + AxisError: if the `axis` value is invalid. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + axis = normalize_axis_index(axis, X.ndim) + Y = dpt.moveaxis(X, axis, 0) + + return tuple(Y[i] for i in range(Y.shape[0])) + + +def tile(x, repetitions, /): + """tile(x, repetitions) + + Repeat an input array `x` along each axis a number of times given by + `repetitions`. + + For `N` = len(`repetitions`) and `M` = len(`x.shape`): + + * If `M < N`, `x` will have `N - M` new axes prepended to its shape + * If `M > N`, `repetitions` will have `M - N` ones prepended to it + + Args: + x (usm_ndarray): input array + + repetitions (Union[int, Tuple[int, ...]]): + The number of repetitions along each dimension of `x`. + + Returns: + usm_ndarray: + tiled output array. + + The returned array will have rank `max(M, N)`. If `S` is the + shape of `x` after prepending dimensions and `R` is + `repetitions` after prepending ones, then the shape of the + result will be `S[i] * R[i]` for each dimension `i`. + + The returned array will have the same data type as `x`. + The returned array will be located on the same device as `x` and + have the same USM allocation type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + if not isinstance(repetitions, tuple): + if isinstance(repetitions, int): + repetitions = (repetitions,) + else: + raise TypeError( + f"Expected tuple or integer type, got {type(repetitions)}." 
+ ) + + rep_dims = len(repetitions) + x_dims = x.ndim + if rep_dims < x_dims: + repetitions = (x_dims - rep_dims) * (1,) + repetitions + elif x_dims < rep_dims: + x = dpt.reshape(x, (rep_dims - x_dims) * (1,) + x.shape) + res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions)) + # case of empty input + if x.size == 0: + return dpt.empty( + res_shape, + dtype=x.dtype, + usm_type=x.usm_type, + sycl_queue=x.sycl_queue, + ) + in_sh = x.shape + if res_shape == in_sh: + return dpt.copy(x) + expanded_sh = [] + broadcast_sh = [] + out_sz = 1 + for i in range(len(res_shape)): + out_sz *= res_shape[i] + reps, sh = repetitions[i], in_sh[i] + if reps == 1: + # dimension will be unchanged + broadcast_sh.append(sh) + expanded_sh.append(sh) + elif sh == 1: + # dimension will be broadcast + broadcast_sh.append(reps) + expanded_sh.append(sh) + else: + broadcast_sh.extend([reps, sh]) + expanded_sh.extend([1, sh]) + exec_q = x.sycl_queue + xdt = x.dtype + xut = x.usm_type + res = dpt.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q) + # no need to copy data for empty output + if out_sz > 0: + x = dpt.broadcast_to( + # this reshape should never copy + dpt.reshape(x, expanded_sh), + broadcast_sh, + ) + # copy broadcast input into flat array + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + hev, cp_ev = ti._copy_usm_ndarray_for_reshape( + src=x, dst=res, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cp_ev) + return dpt.reshape(res, res_shape) diff --git a/dpnp/tensor/_numpy_helper.py b/dpnp/tensor/_numpy_helper.py new file mode 100644 index 000000000000..4ad735823cb3 --- /dev/null +++ b/dpnp/tensor/_numpy_helper.py @@ -0,0 +1,45 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import numpy as np + +_npver = np.lib.NumpyVersion(np.__version__) + +if _npver < "1.25.0": # pragma: no cover + from numpy import AxisError +else: + from numpy.exceptions import AxisError + +if _npver >= "2.0.0": + from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple +else: # pragma: no cover + from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple + + +__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"] diff --git a/dpnp/tensor/_print.py b/dpnp/tensor/_print.py new file mode 100644 index 000000000000..e39bf9041485 --- /dev/null +++ b/dpnp/tensor/_print.py @@ -0,0 +1,501 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import contextlib +import itertools +import operator + +import dpctl +import numpy as np +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +__doc__ = "Print functions for :class:`dpctl.tensor.usm_ndarray`." 
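+
+# A minimal usage sketch of the helpers defined below (assuming the
+# parent `dpnp.tensor` namespace re-exports them, as `dpctl.tensor`
+# does):
+#
+#     import dpnp.tensor as dpt
+#     x = dpt.asarray([1.0, 2.0, 3.0])
+#     with dpt.print_options(precision=2):
+#         print(dpt.usm_ndarray_str(x))  # at most 2 fractional digits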
+ +_print_options = { + "linewidth": 75, + "edgeitems": 3, + "threshold": 1000, + "precision": 8, + "floatmode": "maxprec", + "suppress": False, + "nanstr": "nan", + "infstr": "inf", + "sign": "-", +} + + +def _move_to_next_line(string, s, line_width, prefix): + """Move string to next line if it doesn't fit in the current line.""" + bottom_len = len(s) - (s.rfind("\n") + 1) + next_line = bottom_len + len(string) + 1 > line_width + string = ",\n" + " " * len(prefix) + string if next_line else ", " + string + + return string + + +def _options_dict( + linewidth=None, + edgeitems=None, + threshold=None, + precision=None, + floatmode=None, + suppress=None, + nanstr=None, + infstr=None, + sign=None, + numpy=False, +): + if numpy: + numpy_options = np.get_printoptions() + options = {k: numpy_options[k] for k in _print_options.keys()} + else: + options = _print_options.copy() + + if suppress: + options["suppress"] = True + + local = dict(locals().items()) + for int_arg in ["linewidth", "precision", "threshold", "edgeitems"]: + val = local[int_arg] + if val is not None: + options[int_arg] = operator.index(val) + + for str_arg in ["nanstr", "infstr"]: + val = local[str_arg] + if val is not None: + if not isinstance(val, str): + raise TypeError( + "`{}` ".format(str_arg) + "must be of `string` type." + ) + options[str_arg] = val + + signs = ["-", "+", " "] + if sign is not None: + if sign not in signs: + raise ValueError( + "`sign` must be one of" + + ", ".join("`{}`".format(s) for s in signs) + ) + options["sign"] = sign + + floatmodes = ["fixed", "unique", "maxprec", "maxprec_equal"] + if floatmode is not None: + if floatmode not in floatmodes: + raise ValueError( + "`floatmode` must be one of" + + ", ".join("`{}`".format(m) for m in floatmodes) + ) + options["floatmode"] = floatmode + + return options + + +def set_print_options( + linewidth=None, + edgeitems=None, + threshold=None, + precision=None, + floatmode=None, + suppress=None, + nanstr=None, + infstr=None, + sign=None, + numpy=False, +): + """ + set_print_options(linewidth=None, edgeitems=None, threshold=None, + precision=None, floatmode=None, suppress=None, + nanstr=None, infstr=None, sign=None, numpy=False) + + Set options for printing :class:`dpctl.tensor.usm_ndarray` class. + + Args: + linewidth (int, optional): + Number of characters printed per line. + Raises `TypeError` if linewidth is not an integer. + Default: `75`. + edgeitems (int, optional): + Number of elements at the beginning and end + when the printed array is abbreviated. + Raises `TypeError` if edgeitems is not an integer. + Default: `3`. + threshold (int, optional): + Number of elements that triggers array abbreviation. + Raises `TypeError` if threshold is not an integer. + Default: `1000`. + precision (int or None, optional): + Number of digits printed for floating point numbers. + Raises `TypeError` if precision is not an integer. + Default: `8`. + floatmode (str, optional): + Controls how floating point numbers are interpreted. + `"fixed:`: + Always prints exactly `precision` digits. + `"unique"`: + Ignores precision, prints the number of + digits necessary to uniquely specify each number. + `"maxprec"`: + Prints `precision` digits or fewer, + if fewer will uniquely represent a number. + `"maxprec_equal"`: + Prints an equal number of digits + for each number. This number is `precision` digits + or fewer, if fewer will uniquely represent each number. + Raises `ValueError` if floatmode is not one of + `fixed`, `unique`, `maxprec`, or `maxprec_equal`. 
+ Default: "maxprec_equal" + suppress (bool, optional): + If `True,` numbers equal to zero in the current precision + will print as zero. + Default: `False`. + nanstr (str, optional): + String used to represent nan. + Raises `TypeError` if nanstr is not a string. + Default: `"nan"`. + infstr (str, optional): + String used to represent infinity. + Raises `TypeError` if infstr is not a string. + Default: `"inf"`. + sign (str, optional): + Controls the sign of floating point numbers. + `"-"`: + Omit the sign of positive numbers. + `"+"`: + Always print the sign of positive numbers. + `" "`: + Always print a whitespace in place of the + sign of positive numbers. + Raises `ValueError` if sign is not one of + `"-"`, `"+"`, or `" "`. + Default: `"-"`. + numpy (bool, optional): If `True,` then before other specified print + options are set, a dictionary of Numpy's print options + will be used to initialize dpctl's print options. + Default: "False" + """ + options = _options_dict( + linewidth=linewidth, + edgeitems=edgeitems, + threshold=threshold, + precision=precision, + floatmode=floatmode, + suppress=suppress, + nanstr=nanstr, + infstr=infstr, + sign=sign, + numpy=numpy, + ) + _print_options.update(options) + + +def get_print_options(): + """get_print_options() + + Returns a copy of current options for printing + :class:`dpctl.tensor.usm_ndarray` class. + + Returns: + dict: dictionary with array + printing option settings. + + Options: + - "linewidth" : int, default 75 + - "edgeitems" : int, default 3 + - "threshold" : int, default 1000 + - "precision" : int, default 8 + - "floatmode" : str, default "maxprec_equal" + - "suppress" : bool, default False + - "nanstr" : str, default "nan" + - "infstr" : str, default "inf" + - "sign" : str, default "-" + """ + return _print_options.copy() + + +@contextlib.contextmanager +def print_options(*args, **kwargs): + """ + Context manager for print options. + + Set print options for the scope of a `with` block. + `as` yields dictionary of print options. 
+ """ + options = dpt.get_print_options() + try: + dpt.set_print_options(*args, **kwargs) + yield dpt.get_print_options() + finally: + dpt.set_print_options(**options) + + +def _nd_corners(arr_in, edge_items): + _shape = arr_in.shape + max_shape = 2 * edge_items + 1 + if max(_shape) <= max_shape: + return dpt.asnumpy(arr_in) + res_shape = tuple( + max_shape if _shape[i] > max_shape else _shape[i] + for i in range(arr_in.ndim) + ) + + exec_q = arr_in.sycl_queue + arr_out = dpt.empty( + res_shape, + dtype=arr_in.dtype, + usm_type=arr_in.usm_type, + sycl_queue=exec_q, + ) + + blocks = [] + for i in range(len(_shape)): + if _shape[i] > max_shape: + blocks.append( + ( + np.s_[:edge_items], + np.s_[-edge_items:], + ) + ) + else: + blocks.append((np.s_[:],)) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + hev_list = [] + for slc in itertools.product(*blocks): + hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr_in[slc], + dst=arr_out[slc], + sycl_queue=exec_q, + depends=dep_evs, + ) + hev_list.append(hev) + + dpctl.SyclEvent.wait_for(hev_list) + return dpt.asnumpy(arr_out) + + +def usm_ndarray_str( + x, + line_width=None, + edge_items=None, + threshold=None, + precision=None, + floatmode=None, + suppress=None, + sign=None, + numpy=False, + separator=" ", + prefix="", + suffix="", +): + """ + usm_ndarray_str(x, line_width=None, edgeitems=None, threshold=None, + precision=None, floatmode=None, suppress=None, + sign=None, numpy=False, separator=" ", prefix="", + suffix="") + + Returns a string representing the elements of a + :class:`dpctl.tensor.usm_ndarray`. + + Args: + x (usm_ndarray): + Input array. + line_width (int, optional): + Number of characters printed per line. + Raises `TypeError` if line_width is not an integer. + Default: `75`. + edgeitems (int, optional): + Number of elements at the beginning and end + when the printed array is abbreviated. + Raises `TypeError` if edgeitems is not an integer. + Default: `3`. + threshold (int, optional): + Number of elements that triggers array abbreviation. + Raises `TypeError` if threshold is not an integer. + Default: `1000`. + precision (int or None, optional): + Number of digits printed for floating point numbers. + Raises `TypeError` if precision is not an integer. + Default: `8`. + floatmode (str, optional): + Controls how floating point numbers are interpreted. + `"fixed:`: + Always prints exactly `precision` digits. + `"unique"`: + Ignores precision, prints the number of + digits necessary to uniquely specify each number. + `"maxprec"`: + Prints `precision` digits or fewer, + if fewer will uniquely represent a number. + `"maxprec_equal"`: + Prints an equal number of digits for each number. + This number is `precision` digits or fewer, + if fewer will uniquely represent each number. + Raises `ValueError` if floatmode is not one of + `fixed`, `unique`, `maxprec`, or `maxprec_equal`. + Default: "maxprec_equal" + suppress (bool, optional): + If `True,` numbers equal to zero in the current precision + will print as zero. + Default: `False`. + sign (str, optional): + Controls the sign of floating point numbers. + `"-"`: + Omit the sign of positive numbers. + `"+"`: + Always print the sign of positive numbers. + `" "`: + Always print a whitespace in place of the + sign of positive numbers. + Raises `ValueError` if sign is not one of + `"-"`, `"+"`, or `" "`. + Default: `"-"`. 
+ numpy (bool, optional): + If `True,` then before other specified print + options are set, a dictionary of Numpy's print options + will be used to initialize dpctl's print options. + Default: "False" + separator (str, optional): + String inserted between elements of the array string. + Default: " " + prefix (str, optional): + String used to determine spacing to the left of the array string. + Default: "" + suffix (str, optional): + String that determines length of the last line of the array string. + Default: "" + + Returns: + str: string representation of input array. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + options = get_print_options() + options.update( + _options_dict( + linewidth=line_width, + edgeitems=edge_items, + threshold=threshold, + precision=precision, + floatmode=floatmode, + suppress=suppress, + sign=sign, + numpy=numpy, + ) + ) + + threshold = options["threshold"] + edge_items = options["edgeitems"] + + if x.size > threshold: + data = _nd_corners(x, edge_items) + options["threshold"] = 0 + else: + data = dpt.asnumpy(x) + with np.printoptions(**options): + s = np.array2string( + data, separator=separator, prefix=prefix, suffix=suffix + ) + return s + + +def usm_ndarray_repr( + x, line_width=None, precision=None, suppress=None, prefix="usm_ndarray" +): + """ + usm_ndarray_repr(x, line_width=None, precision=None, + suppress=None, prefix="") + + Returns a formatted string representing the elements + of a :class:`dpctl.tensor.usm_ndarray` and its data type, + if not a default type. + + Args: + x (usm_ndarray): Input array. + line_width (int, optional): Number of characters printed per line. + Raises `TypeError` if line_width is not an integer. + Default: `75`. + precision (int or None, optional): Number of digits printed for + floating point numbers. + Raises `TypeError` if precision is not an integer. + Default: `8`. + suppress (bool, optional): If `True,` numbers equal to zero + in the current precision will print as zero. + Default: `False`. + prefix (str, optional): String inserted at the start of the array + string. + Default: "" + + Returns: + str: formatted string representing the input array + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + if line_width is None: + line_width = _print_options["linewidth"] + + show_dtype = x.dtype not in [ + dpt.bool, + dpt.int64, + dpt.float64, + dpt.complex128, + ] + + prefix = prefix + "(" + suffix = ")" + + s = usm_ndarray_str( + x, + line_width=line_width, + precision=precision, + suppress=suppress, + separator=", ", + prefix=prefix, + suffix=suffix, + ) + + if show_dtype or x.size == 0: + dtype_str = f"dtype={x.dtype.name}" + dtype_str = _move_to_next_line(dtype_str, s, line_width, prefix) + else: + dtype_str = "" + + options = get_print_options() + threshold = options["threshold"] + if (x.size == 0 and x.shape != (0,)) or x.size > threshold: + shape_str = f"shape={x.shape}" + shape_str = _move_to_next_line(shape_str, s, line_width, prefix) + else: + shape_str = "" + + return prefix + s + shape_str + dtype_str + suffix diff --git a/dpnp/tensor/_reduction.py b/dpnp/tensor/_reduction.py new file mode 100644 index 000000000000..782fc2b0b442 --- /dev/null +++ b/dpnp/tensor/_reduction.py @@ -0,0 +1,830 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti +import dpnp.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_tuple +from ._type_utils import ( + _default_accumulation_dtype, + _default_accumulation_dtype_fp_types, + _to_device_supported_dtype, +) + + +def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + x_tmp = x + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + red_nd = len(axis) + if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]): + raise ValueError("reduction cannot be performed over zero-size axes") + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = x.dtype + res_usm_type = x.usm_type + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=x_tmp, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[cpy_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + return out + + hev, red_ev = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, red_ev) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) + return out + + +def _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + _reduction_fn, + _dtype_supported, + _default_reduction_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + arr = x + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + arr = dpt.permute_dims(x, perm) + red_nd = len(axis) + res_shape = arr.shape[: nd - red_nd] + q = x.sycl_queue + inp_dt = x.dtype + if dtype is None: + res_dt = _default_reduction_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + res_usm_type = x.usm_type + + implemented_types = _dtype_supported(inp_dt, res_dt, res_usm_type, q) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined reduction data type does not " + "have direct implementation" + ) + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpt.get_execution_queue((q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=out, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + return out + + if implemented_types: + ht_e, red_e = _reduction_fn( + src=arr, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_e, red_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[red_e] + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + out = orig_out + else: + if _dtype_supported(res_dt, res_dt, res_usm_type, q): + tmp = dpt.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + ht_e_red, red_ev = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, red_ev) + else: + buf_dt = _default_reduction_type_fn(inp_dt, q) + tmp = dpt.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + tmp_res = dpt.empty( + res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, r_e) + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[r_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) + return out + + +def _search_over_axis(x, axis, keepdims, out, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + x_tmp = x + else: + if isinstance(axis, int): + axis = (axis,) + else: + raise TypeError( + f"'axis' argument expected to have type 'int' " + r"or be `None`, " + f"got type {type(axis)}" + ) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = 
len(axis) + if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]): + raise ValueError("reduction cannot be performed over zero-size axes") + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = ti.default_device_index_type(exec_q.sycl_device) + res_usm_type = x.usm_type + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out) and red_nd > 0: + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_fill, fill_ev = ti._full_usm_ndarray( + fill_value=0, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_fill, fill_ev) + return out + + hev, red_ev = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, red_ev) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) + return out + + +def argmax(x, /, *, axis=None, keepdims=False, out=None): + """ + Returns the indices of the maximum values of the input array ``x`` along a + specified axis. + + When the maximum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If ``None``, returns the index of the + maximum value of the flattened array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + maximum values. If the entire array was searched, a + zero-dimensional array is returned. 
The returned array has the + default array index data type for the device of ``x``. + """ + return _search_over_axis(x, axis, keepdims, out, tri._argmax_over_axis) + + +def argmin(x, /, *, axis=None, keepdims=False, out=None): + """ + Returns the indices of the minimum values of the input array ``x`` along a + specified axis. + + When the minimum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If ``None``, returns the index of the + minimum value of the flattened array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + minimum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of ``x``. + """ + return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis) + + +def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): + """ + Counts the number of elements in the input array ``x`` which are non-zero. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which to count. If a tuple of unique integers, + the number of non-zero values are computed over multiple axes. + If ``None``, the number of non-zero values is computed over the + entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and data + type. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the count of non-zero values. If the sum was + computed over the entire array, a zero-dimensional array is + returned. The returned array will have the default array index data + type. + """ + if x.dtype != dpt.bool: + x = dpt.astype(x, dpt.bool, copy=False) + return sum( + x, + axis=axis, + dtype=ti.default_device_index_type(x.sycl_device), + keepdims=keepdims, + out=out, + ) + + +def logsumexp(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the logarithm of the sum of exponentials of elements in the + input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If ``None``, the result is computed over the entire array. + Default: ``None``. 
+ dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real-valued floating-point data type, the + returned array will have the same data type as ``x``. + * If ``x`` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array ``x`` is allocated. + * If ``x`` has a complex-valued floating-point data type, + an error is raised. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the result. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. + The returned array has the data type as described in the + ``dtype`` parameter description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_accumulation_dtype_fp_types, + ) + + +def max(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the maximum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which maxima must be computed. If a tuple + of unique integers, the maxima are computed over multiple axes. + If ``None``, the max is computed over the entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the maxima. If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. + """ + return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis) + + +def min(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the minimum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If ``None``, the min is computed over the entire array. + Default: ``None``. 
+ keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. + """ + return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) + + +def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the product of elements in the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. If a tuple + of unique integers, products are computed over multiple axes. + If ``None``, the product is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + ``x``. + * If ``x`` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array ``x`` is allocated. + * If ``x`` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array ``x`` is allocated. + * If ``x`` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array ``x`` is allocated. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the product. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the products. If the product was computed over + the entire array, a zero-dimensional array is returned. The + returned array has the data type as described in the ``dtype`` + parameter description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, + _default_accumulation_dtype, + ) + + +def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the square root of the sum of squares of elements in the input + array ``x``. + + Args: + x (usm_ndarray): + input array. 
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which values must be computed. If a tuple
+            of unique integers, values are computed over multiple axes.
+            If ``None``, the result is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real-valued floating-point data type, the
+              returned array will have the same data type as ``x``.
+            * If ``x`` has a boolean or integral data type, the returned array
+              will have the default floating point data type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has a complex-valued floating-point data type,
+              an error is raised.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the result. Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the results. If the result was computed over
+            the entire array, a zero-dimensional array is returned. The
+            returned array has the data type as described in the ``dtype``
+            parameter description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._hypot_over_axis,
+        lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported(
+            inp_dt, res_dt
+        ),
+        _default_accumulation_dtype_fp_types,
+    )
+
+
+def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
+    """
+    Calculates the sum of elements in the input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which sums must be computed. If a tuple
+            of unique integers, sums are computed over multiple axes.
+            If ``None``, the sum is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              ``x``.
+            * If ``x`` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array ``x`` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the sum.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the sums. If the sum was computed over the
+            entire array, a zero-dimensional array is returned. The returned
+            array has the data type as described in the ``dtype`` parameter
+            description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._sum_over_axis,
+        tri._sum_over_axis_dtype_supported,
+        _default_accumulation_dtype,
+    )
diff --git a/dpnp/tensor/_reshape.py b/dpnp/tensor/_reshape.py
new file mode 100644
index 000000000000..0187ae496003
--- /dev/null
+++ b/dpnp/tensor/_reshape.py
@@ -0,0 +1,208 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+
+import numpy as np
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+
+from ._tensor_impl import (
+    _copy_usm_ndarray_for_reshape,
+    _ravel_multi_index,
+    _unravel_index,
+)
+
+__doc__ = "Implementation module for :func:`dpnp.tensor.reshape`."
+
+
+def _make_unit_indexes(shape):
+    """
+    Construct a diagonal matrix with ones on the diagonal,
+    except where the corresponding element of shape is 1.
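+
+    For example, for ``shape == (2, 1, 3)`` the rows of the result are the
+    unit multi-indexes ``(1, 0, 0)``, ``(0, 0, 0)`` and ``(0, 0, 1)``:
+    axes of length 1 contribute an all-zero row, so they do not affect
+    the stride probing performed by ``reshaped_strides``.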
+ """ + nd = len(shape) + mi = np.zeros((nd, nd), dtype="u4") + for i, dim in enumerate(shape): + mi[i, i] = 1 if dim > 1 else 0 + return mi + + +def ti_unravel_index(flat_index, shape, order="C"): + return _unravel_index(flat_index, shape, order) + + +def ti_ravel_multi_index(multi_index, shape, order="C"): + return _ravel_multi_index(multi_index, shape, order) + + +def reshaped_strides(old_sh, old_sts, new_sh, order="C"): + """ + When reshaping array with `old_sh` shape and `old_sts` strides + into the new shape `new_sh`, returns the new stride if the reshape + can be a view, otherwise returns `None`. + """ + eye_new_mi = _make_unit_indexes(new_sh) + new_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + old_sts, ti_unravel_index(flat_index, old_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, new_sh, order=order) + for unitvec in eye_new_mi + ] + ] + eye_old_mi = _make_unit_indexes(old_sh) + check_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + new_sts, ti_unravel_index(flat_index, new_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, old_sh, order=order) + for unitvec in eye_old_mi + ] + ] + valid = all( + check_st == old_st or old_dim == 1 + for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh) + ) + return new_sts if valid else None + + +def reshape(X, /, shape, *, order="C", copy=None): + """reshape(x, shape, order="C") + + Reshapes array ``x`` into new shape. + + Args: + x (usm_ndarray): + input array + shape (Tuple[int]): + the desired shape of the resulting array. + order ("C", "F", optional): + memory layout of the resulting array + if a copy is found to be necessary. Supported + choices are ``"C"`` for C-contiguous, or row-major layout; + and ``"F"`` for F-contiguous, or column-major layout. + + Returns: + usm_ndarray: + Reshaped array is a view, if possible, + and a copy otherwise with memory layout as indicated + by ``order`` keyword. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError + if not isinstance(shape, (list, tuple)): + shape = (shape,) + if order in "cfCF": + order = order.upper() + else: + raise ValueError( + f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}" + ) + if copy not in (True, False, None): + raise ValueError( + f"Keyword 'copy' not recognized. 
Expecting True, False, " + f"or None, got {copy}" + ) + shape = [operator.index(d) for d in shape] + negative_ones_count = 0 + for nshi in shape: + if nshi == -1: + negative_ones_count = negative_ones_count + 1 + if (nshi < -1) or negative_ones_count > 1: + raise ValueError( + "Target shape should have at most 1 negative " + "value which can only be -1" + ) + if negative_ones_count: + sz = -np.prod(shape) + if sz == 0: + raise ValueError( + f"Can not reshape array of size {X.size} into " + f"shape {tuple(i for i in shape if i >= 0)}" + ) + v = X.size // sz + shape = [v if d == -1 else d for d in shape] + if X.size != np.prod(shape): + raise ValueError(f"Can not reshape into {shape}") + if X.size: + newsts = reshaped_strides(X.shape, X.strides, shape, order=order) + else: + newsts = (1,) * len(shape) + copy_required = newsts is None + if copy_required and (copy is False): + raise ValueError( + "Reshaping the array requires a copy, but no copying was " + "requested by using copy=False" + ) + copy_q = X.sycl_queue + if copy_required or (copy is True): + # must perform a copy + copy_q = X.sycl_queue + flat_res = dpt.usm_ndarray( + (X.size,), + dtype=X.dtype, + buffer=X.usm_type, + buffer_ctor_kwargs={"queue": copy_q}, + ) + _manager = SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + if order == "C": + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + else: + X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, r_e) + return dpt.usm_ndarray( + tuple(shape), dtype=X.dtype, buffer=flat_res, order=order + ) + # can form a view + if (len(shape) == X.ndim) and all( + s1 == s2 for s1, s2 in zip(shape, X.shape) + ): + return X + return dpt.usm_ndarray( + shape, + dtype=X.dtype, + buffer=X, + strides=tuple(newsts), + offset=X._element_offset, + ) diff --git a/dpnp/tensor/_scalar_utils.py b/dpnp/tensor/_scalar_utils.py new file mode 100644 index 000000000000..828f01f1c862 --- /dev/null +++ b/dpnp/tensor/_scalar_utils.py @@ -0,0 +1,123 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numbers
+
+import dpctl.memory as dpm
+import numpy as np
+
+import dpnp.tensor as dpt
+
+from ._type_utils import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _to_device_supported_dtype,
+)
+from ._usmarray import _is_object_with_buffer_protocol as _is_buffer
+
+
+def _get_queue_usm_type(o):
+    """Return a 2-tuple of the SYCL queue and USM type of the memory backing
+    object `o`, or ``(None, None)`` if `o` has no USM allocation."""
+    if isinstance(o, dpt.usm_ndarray):
+        return o.sycl_queue, o.usm_type
+    elif hasattr(o, "__sycl_usm_array_interface__"):
+        try:
+            m = dpm.as_usm_memory(o)
+            return m.sycl_queue, m.get_usm_type()
+        except Exception:
+            return None, None
+    return None, None
+
+
+def _get_dtype(o, dev):
+    if isinstance(o, dpt.usm_ndarray):
+        return o.dtype
+    if hasattr(o, "__sycl_usm_array_interface__"):
+        return dpt.asarray(o).dtype
+    if _is_buffer(o):
+        host_dt = np.array(o).dtype
+        dev_dt = _to_device_supported_dtype(host_dt, dev)
+        return dev_dt
+    if hasattr(o, "dtype"):
+        dev_dt = _to_device_supported_dtype(o.dtype, dev)
+        return dev_dt
+    if isinstance(o, bool):
+        return WeakBooleanType(o)
+    if isinstance(o, int):
+        return WeakIntegralType(o)
+    if isinstance(o, float):
+        return WeakFloatingType(o)
+    if isinstance(o, complex):
+        return WeakComplexType(o)
+    return np.object_
+
+
+def _validate_dtype(dt) -> bool:
+    return isinstance(
+        dt,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ) or (
+        isinstance(dt, dpt.dtype)
+        and dt
+        in [
+            dpt.bool,
+            dpt.int8,
+            dpt.uint8,
+            dpt.int16,
+            dpt.uint16,
+            dpt.int32,
+            dpt.uint32,
+            dpt.int64,
+            dpt.uint64,
+            dpt.float16,
+            dpt.float32,
+            dpt.float64,
+            dpt.complex64,
+            dpt.complex128,
+        ]
+    )
+
+
+def _get_shape(o):
+    if isinstance(o, dpt.usm_ndarray):
+        return o.shape
+    if _is_buffer(o):
+        return memoryview(o).shape
+    if isinstance(o, numbers.Number):
+        return ()
+    return getattr(o, "shape", tuple())
+
+
+__all__ = [
+    "_get_dtype",
+    "_get_queue_usm_type",
+    "_get_shape",
+    "_validate_dtype",
+]
diff --git a/dpnp/tensor/_search_functions.py b/dpnp/tensor/_search_functions.py
new file mode 100644
index 000000000000..c1d45ee4bb33
--- /dev/null
+++ b/dpnp/tensor/_search_functions.py
@@ -0,0 +1,415 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK +from ._manipulation_functions import _broadcast_shape_impl +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + WeakBooleanType, + WeakComplexType, + WeakFloatingType, + WeakIntegralType, + _all_data_types, + _can_cast, + _is_weak_dtype, + _strong_dtype_num_kind, + _to_device_supported_dtype, + _weak_type_num_kind, +) + + +def _default_dtype_from_weak_type(dt, dev): + if isinstance(dt, WeakBooleanType): + return dpt.bool + if isinstance(dt, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dt, WeakFloatingType): + return dpt.dtype(ti.default_device_fp_type(dev)) + if isinstance(dt, WeakComplexType): + return dpt.dtype(ti.default_device_complex_type(dev)) + + +def _resolve_two_weak_types(o1_dtype, o2_dtype, dev): + """Resolves two weak data types per NEP-0050""" + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + return _default_dtype_from_weak_type( + o1_dtype, dev + ), _default_dtype_from_weak_type(o2_dtype, dev) + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _where_result_type(dt1, dt2, dev): + res_dtype = 
dpt.result_type(dt1, dt2)
+    fp16 = dev.has_aspect_fp16
+    fp64 = dev.has_aspect_fp64
+
+    all_dts = _all_data_types(fp16, fp64)
+    if res_dtype in all_dts:
+        return res_dtype
+    else:
+        for res_dtype_ in all_dts:
+            if _can_cast(dt1, res_dtype_, fp16, fp64) and _can_cast(
+                dt2, res_dtype_, fp16, fp64
+            ):
+                return res_dtype_
+    return None
+
+
+def where(condition, x1, x2, /, *, order="K", out=None):
+    """
+    Returns :class:`dpnp.tensor.usm_ndarray` with elements chosen
+    from ``x1`` or ``x2`` depending on ``condition``.
+
+    Args:
+        condition (usm_ndarray): When ``True`` yields from ``x1``,
+            and otherwise yields from ``x2``.
+            Must be compatible with ``x1`` and ``x2`` according
+            to broadcasting rules.
+        x1 (Union[usm_ndarray, bool, int, float, complex]):
+            Array from which values are chosen when ``condition`` is ``True``.
+            Must be compatible with ``condition`` and ``x2`` according
+            to broadcasting rules.
+        x2 (Union[usm_ndarray, bool, int, float, complex]):
+            Array from which values are chosen when ``condition`` is not
+            ``True``.
+            Must be compatible with ``condition`` and ``x1`` according
+            to broadcasting rules.
+        order (``"K"``, ``"C"``, ``"F"``, ``"A"``, optional):
+            Memory layout of the new output array,
+            if parameter ``out`` is ``None``.
+            Default: ``"K"``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of `out` must match the expected shape and the
+            expected data type of the result.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            An array with elements from ``x1`` where ``condition`` is ``True``,
+            and elements from ``x2`` elsewhere.
+
+            The data type of the returned array is determined by applying
+            the Type Promotion Rules to ``x1`` and ``x2``.
+    """
+    if not isinstance(condition, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(condition)}"
+        )
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, condition_usm_type = condition.sycl_queue, condition.usm_type
+    q2, x1_usm_type = _get_queue_usm_type(x1)
+    q3, x2_usm_type = _get_queue_usm_type(x2)
+    if q2 is None and q3 is None:
+        exec_q = q1
+        out_usm_type = condition_usm_type
+    elif q3 is None:
+        exec_q = dpt.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpt.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x1_usm_type,
+            )
+        )
+    elif q2 is None:
+        exec_q = dpt.get_execution_queue((q1, q3))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpt.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x2_usm_type,
+            )
+        )
+    else:
+        exec_q = dpt.get_execution_queue((q1, q2, q3))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpt.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x1_usm_type,
+                x2_usm_type,
+            )
+        )
+    dpt.validate_usm_type(out_usm_type, allow_none=False)
+    condition_shape = condition.shape
+    x1_shape = _get_shape(x1)
+    x2_shape = _get_shape(x2)
+    if not all(
+        isinstance(s, (tuple, list))
+        for s in (
+            x1_shape,
+            x2_shape,
+        )
+    ):
+        raise TypeError(
+            "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + condition_shape, + x1_shape, + x2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{condition_shape}, {x1_shape}, and {x2_shape}" + ) + sycl_dev = exec_q.sycl_device + x1_dtype = _get_dtype(x1, sycl_dev) + x2_dtype = _get_dtype(x2, sycl_dev) + if not all(_validate_dtype(o) for o in (x1_dtype, x2_dtype)): + raise ValueError("Operands have unsupported data types") + x1_dtype, x2_dtype = _resolve_two_weak_types(x1_dtype, x2_dtype, sycl_dev) + out_dtype = _where_result_type(x1_dtype, x2_dtype, sycl_dev) + if out_dtype is None: + raise TypeError( + "function 'where' does not support input " + f"types ({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced " + "to any supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if out_dtype != out.dtype: + raise ValueError( + f"Output array of type {out_dtype} is needed, " + f"got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(condition, out) and not ti._same_logical_tensors( + condition, out + ): + out = dpt.empty_like(out) + + if isinstance(x1, dpt.usm_ndarray): + if ( + ti._array_overlap(x1, out) + and not ti._same_logical_tensors(x1, out) + and x1_dtype == out_dtype + ): + out = dpt.empty_like(out) + + if isinstance(x2, dpt.usm_ndarray): + if ( + ti._array_overlap(x2, out) + and not ti._same_logical_tensors(x2, out) + and x2_dtype == out_dtype + ): + out = dpt.empty_like(out) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + condition, + x1, + x2, + ) + ) + else "C" + ) + if not isinstance(x1, dpt.usm_ndarray): + x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q) + if not isinstance(x2, dpt.usm_ndarray): + x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q) + + if condition.size == 0: + if out is not None: + return out + else: + if order == "K": + return _empty_like_triple_orderK( + condition, + x1, + x2, + out_dtype, + res_shape, + out_usm_type, + exec_q, + ) + else: + return dpt.empty( + res_shape, + dtype=out_dtype, + order=order, + usm_type=out_usm_type, + sycl_queue=exec_q, + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if x1_dtype != out_dtype: + if order == "K": + _x1 = _empty_like_orderK(x1, out_dtype) + else: + _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs + ) + x1 = _x1 + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + + if x2_dtype != out_dtype: + if order == "K": + _x2 = _empty_like_orderK(x2, out_dtype) + else: + _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs + ) + x2 = _x2 + 
_manager.add_event_pair(ht_copy2_ev, copy2_ev) + + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=out_dtype, + order=order, + usm_type=out_usm_type, + sycl_queue=exec_q, + ) + + if condition_shape != res_shape: + condition = dpt.broadcast_to(condition, res_shape) + if x1_shape != res_shape: + x1 = dpt.broadcast_to(x1, res_shape) + if x2_shape != res_shape: + x2 = dpt.broadcast_to(x2, res_shape) + + dep_evs = _manager.submitted_events + hev, where_ev = ti._where( + condition=condition, + x1=x1, + x2=x2, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, where_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[where_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + + return out diff --git a/dpnp/tensor/_searchsorted.py b/dpnp/tensor/_searchsorted.py new file mode 100644 index 000000000000..4c9b54cb63fa --- /dev/null +++ b/dpnp/tensor/_searchsorted.py @@ -0,0 +1,189 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+
+from typing import Literal, Union
+
+import dpctl
+import dpctl.utils as du
+
+from ._compute_follows_data import (
+    ExecutionPlacementError,
+    get_coerced_usm_type,
+    get_execution_queue,
+)
+from ._copy_utils import _empty_like_orderK
+from ._ctors import empty
+from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy
+from ._tensor_impl import _take as ti_take
+from ._tensor_impl import (
+    default_device_index_type as ti_default_device_index_type,
+)
+from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right
+from ._type_utils import isdtype, result_type
+from ._usmarray import usm_ndarray
+
+
+def searchsorted(
+    x1: usm_ndarray,
+    x2: usm_ndarray,
+    /,
+    *,
+    side: Literal["left", "right"] = "left",
+    sorter: Union[usm_ndarray, None] = None,
+) -> usm_ndarray:
+    """searchsorted(x1, x2, side='left', sorter=None)
+
+    Finds the indices into `x1` such that, if the corresponding elements
+    in `x2` were inserted before the indices, the order of `x1`, when sorted
+    in ascending order, would be preserved.
+
+    Args:
+        x1 (usm_ndarray):
+            input array. Must be a one-dimensional array. If `sorter` is
+            `None`, must be sorted in ascending order; otherwise, `sorter` must
+            be an array of indices that sort `x1` in ascending order.
+        x2 (usm_ndarray):
+            array containing search values.
+        side (Literal["left", "right"]):
+            argument controlling which index is returned if a value lands
+            exactly on an edge. If `x2` is an array of rank `N` where
+            `v = x2[n, m, ..., j]`, the element `ret[n, m, ..., j]` in the
+            return array `ret` contains the position `i` such that
+            if `side="left"`, it is the first index such that
+            `x1[i-1] < v <= x1[i]`, `0` if `v <= x1[0]`, and `x1.size`
+            if `v > x1[-1]`;
+            and if `side="right"`, it is the first position `i` such that
+            `x1[i-1] <= v < x1[i]`, `0` if `v < x1[0]`, and `x1.size`
+            if `v >= x1[-1]`. Default: `"left"`.
+        sorter (Optional[usm_ndarray]):
+            array of indices that sort `x1` in ascending order. The array must
+            have the same shape as `x1` and have an integral data type.
+            Out of bound index values of `sorter` array are treated using
+            `"wrap"` mode documented in :py:func:`dpnp.tensor.take`.
+            Default: `None`.
+    """
+    if not isinstance(x1, usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}")
+    if not isinstance(x2, usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}")
+    if sorter is not None and not isinstance(sorter, usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(sorter)}")
+
+    if side not in ["left", "right"]:
+        raise ValueError(
+            "Unrecognized value of 'side' keyword argument. "
+            "Expected either 'left' or 'right'"
+        )
+
+    if sorter is None:
+        q = get_execution_queue([x1.sycl_queue, x2.sycl_queue])
+    else:
+        q = get_execution_queue(
+            [x1.sycl_queue, x2.sycl_queue, sorter.sycl_queue]
+        )
+    if q is None:
+        raise ExecutionPlacementError(
+            "Execution placement can not be unambiguously "
+            "inferred from input arguments."
+ ) + + if x1.ndim != 1: + raise ValueError("First argument array must be one-dimensional") + + x1_dt = x1.dtype + x2_dt = x2.dtype + + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + ev = dpctl.SyclEvent() + if sorter is not None: + if not isdtype(sorter.dtype, "integral"): + raise ValueError( + f"Sorter array must have integral data type, got {sorter.dtype}" + ) + if x1.shape != sorter.shape: + raise ValueError( + "Sorter array must be one-dimension with the same " + "shape as the first argument array" + ) + res = empty(x1.shape, dtype=x1_dt, usm_type=x1.usm_type, sycl_queue=q) + ind = (sorter,) + axis = 0 + wrap_out_of_bound_indices_mode = 0 + ht_ev, ev = ti_take( + x1, + ind, + res, + axis, + wrap_out_of_bound_indices_mode, + sycl_queue=q, + depends=dep_evs, + ) + x1 = res + _manager.add_event_pair(ht_ev, ev) + + if x1_dt != x2_dt: + dt = result_type(x1, x2) + if x1_dt != dt: + x1_buf = _empty_like_orderK(x1, dt) + dep_evs = _manager.submitted_events + ht_ev, ev = ti_copy( + src=x1, dst=x1_buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + x1 = x1_buf + if x2_dt != dt: + x2_buf = _empty_like_orderK(x2, dt) + dep_evs = _manager.submitted_events + ht_ev, ev = ti_copy( + src=x2, dst=x2_buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + x2 = x2_buf + + dst_usm_type = get_coerced_usm_type([x1.usm_type, x2.usm_type]) + index_dt = ti_default_device_index_type(q) + + dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type) + + dep_evs = _manager.submitted_events + if side == "left": + ht_ev, s_ev = _searchsorted_left( + hay=x1, + needles=x2, + positions=dst, + sycl_queue=q, + depends=dep_evs, + ) + else: + ht_ev, s_ev = _searchsorted_right( + hay=x1, needles=x2, positions=dst, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, s_ev) + return dst diff --git a/dpnp/tensor/_set_functions.py b/dpnp/tensor/_set_functions.py new file mode 100644 index 000000000000..067de75c42ce --- /dev/null +++ b/dpnp/tensor/_set_functions.py @@ -0,0 +1,794 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from typing import NamedTuple, Optional, Union + +import dpctl.utils as du + +import dpnp.tensor as dpt + +from ._copy_utils import _empty_like_orderK +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._tensor_elementwise_impl import _not_equal, _subtract +from ._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _extract, + _full_usm_ndarray, + _linspace_step, + _take, + default_device_index_type, + mask_positions, +) +from ._tensor_sorting_impl import ( + _argsort_ascending, + _isin, + _searchsorted_left, + _sort_ascending, +) +from ._type_utils import ( + _resolve_weak_types_all_py_ints, + _to_device_supported_dtype, +) + +__all__ = [ + "isin", + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] + + +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. 
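+
+    A minimal illustrative example (assuming `unique_values` is re-exported
+    as `dpnp.tensor.unique_values` and a default SYCL device is available)::
+
+        import dpnp.tensor as dpt
+
+        x = dpt.asarray([3, 1, 2, 1, 3])
+        vals = dpt.unique_values(x)  # sorted unique elements: 1, 2, 3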
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx + s = dpt.empty_like(fx, order="C") + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # writing into new allocation, no dependencies + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + return s + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, ex_e = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, ex_e) + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) + s = dpt.empty_like(fx, order="C") + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + dst=s, + trailing_dims_to_sort=1, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency, since we write into new allocation + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + return UniqueCountsResult( + s, + dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, ex_e = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, ex_e) + unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + # writing into new allocation, no dependency + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + # no dependency, writing into disjoint segmenent of new allocation + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt.empty_like(unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=unique_counts[1:], + src2=unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. 
+ Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, argsort_ev = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, argsort_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, take_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape)) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, uv_ev = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, uv_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) 
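+    # cum_unique_counts[:-1] now holds the position of the first
+    # occurrence of each unique value in the sorted copy; the total
+    # element count is appended next so that adjacent differences give
+    # the per-value counts.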
+ ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + + inv = dpt.empty_like(x, dtype=ind_dt, order="C") + ht_ev, ssl_ev = _searchsorted_left( + hay=unique_vals, + needles=x, + positions=inv, + sycl_queue=exec_q, + depends=[ + uv_ev, + ], + ) + _manager.add_event_pair(ht_ev, ssl_ev) + + return UniqueInverseResult(unique_vals, inv) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
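+
+    A minimal illustrative example (assuming `unique_all` is re-exported
+    as `dpnp.tensor.unique_all`)::
+
+        import dpnp.tensor as dpt
+
+        x = dpt.asarray([3, 1, 2, 1, 3])
+        r = dpt.unique_all(x)
+        # r.values: [1, 2, 3]; r.indices (first occurrences): [1, 2, 0]
+        # r.inverse_indices: [2, 0, 1, 0, 2]; r.counts: [2, 1, 2]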
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + dpt.empty_like(fx, dtype=ind_dt), + ) + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, args_ev = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, args_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, take_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + _counts = dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + _counts, + ) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, uv_ev = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, uv_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, sub_ev = _subtract( + 
src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + + inv = dpt.empty_like(x, dtype=ind_dt, order="C") + ht_ev, ssl_ev = _searchsorted_left( + hay=unique_vals, + needles=x, + positions=inv, + sycl_queue=exec_q, + depends=[ + uv_ev, + ], + ) + _manager.add_event_pair(ht_ev, ssl_ev) + return UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + inv, + _counts, + ) + + +def isin( + x: Union[dpt.usm_ndarray, int, float, complex, bool], + test_elements: Union[dpt.usm_ndarray, int, float, complex, bool], + /, + *, + invert: Optional[bool] = False, +) -> dpt.usm_ndarray: + """isin(x, test_elements, /, *, invert=False) + + Tests `x in test_elements` for each element of `x`. Returns a boolean array + with the same shape as `x` that is `True` where the element is in + `test_elements`, `False` otherwise. + + Args: + x (Union[usm_ndarray, bool, int, float, complex]): + input element or elements. + test_elements (Union[usm_ndarray, bool, int, float, complex]): + elements against which to test each value of `x`. + invert (Optional[bool]): + if `True`, the output results are inverted, i.e., are equivalent to + testing `x not in test_elements` for each element of `x`. + Default: `False`. + + Returns: + usm_ndarray: + an array of the inclusion test results. The returned array has a + boolean data type and the same shape as `x`. + """ + q1, x_usm_type = _get_queue_usm_type(x) + q2, test_usm_type = _get_queue_usm_type(test_elements) + if q1 is None and q2 is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + "One of the arguments must represent USM allocation and " + "expose `__sycl_usm_array_interface__` property" + ) + if q1 is None: + exec_q = q2 + res_usm_type = test_usm_type + elif q2 is None: + exec_q = q1 + res_usm_type = x_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + test_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + sycl_dev = exec_q.sycl_device + + if not isinstance(invert, bool): + raise TypeError( + "`invert` keyword argument must be of boolean type, " + f"got {type(invert)}" + ) + + x_dt = _get_dtype(x, sycl_dev) + test_dt = _get_dtype(test_elements, sycl_dev) + if not all(_validate_dtype(dt) for dt in (x_dt, test_dt)): + raise ValueError("Operands have unsupported data types") + + x_sh = _get_shape(x) + if isinstance(test_elements, dpt.usm_ndarray) and test_elements.size == 0: + if invert: + return dpt.ones( + x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + return dpt.zeros( + x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q + ) + + dt1, dt2 = _resolve_weak_types_all_py_ints(x_dt, test_dt, sycl_dev) + dt = _to_device_supported_dtype(dpt.result_type(dt1, dt2), sycl_dev) + + if not isinstance(x, dpt.usm_ndarray): + x_arr = dpt.asarray( + x, dtype=dt1, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + x_arr = x + + if not isinstance(test_elements, dpt.usm_ndarray): + test_arr = dpt.asarray( + test_elements, dtype=dt2, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + test_arr = test_elements + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + if x_dt != dt: + x_buf = _empty_like_orderK(x_arr, dt, res_usm_type, exec_q) + ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray( + src=x_arr, dst=x_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + else: + x_buf = x_arr + + if test_dt != dt: + # copy into C-contiguous memory, because the array will be flattened + test_buf = dpt.empty_like( + test_arr, dtype=dt, order="C", usm_type=res_usm_type + ) + ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray( + src=test_arr, dst=test_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + else: + test_buf = test_arr + + test_buf = dpt.reshape(test_buf, -1) + test_buf = dpt.sort(test_buf) + + dst = dpt.empty_like( + x_buf, dtype=dpt.bool, usm_type=res_usm_type, order="C" + ) + + dep_evs = _manager.submitted_events + ht_ev, s_ev = _isin( + needles=x_buf, + hay=test_buf, + dst=dst, + sycl_queue=exec_q, + invert=invert, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, s_ev) + return dst diff --git a/dpnp/tensor/_slicing.pxi b/dpnp/tensor/_slicing.pxi new file mode 100644 index 000000000000..f387aef8afd8 --- /dev/null +++ b/dpnp/tensor/_slicing.pxi @@ -0,0 +1,383 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numbers
+from operator import index
+from cpython.buffer cimport PyObject_CheckBuffer
+from numpy import ndarray
+
+
+cdef bint _is_buffer(object o):
+    return PyObject_CheckBuffer(o)
+
+
+cdef Py_ssize_t _slice_len(
+    Py_ssize_t sl_start,
+    Py_ssize_t sl_stop,
+    Py_ssize_t sl_step
+):
+    """
+    Compute len(range(sl_start, sl_stop, sl_step))
+    """
+    if sl_start == sl_stop:
+        return 0
+    if sl_step > 0:
+        if sl_start > sl_stop:
+            return 0
+        # 1 + argmax k such that sl_start + sl_step*k < sl_stop
+        return 1 + ((sl_stop - sl_start - 1) // sl_step)
+    else:
+        if sl_start < sl_stop:
+            return 0
+        return 1 + ((sl_stop - sl_start + 1) // sl_step)
+
+
+cdef bint _is_integral(object x) except *:
+    """Gives True if x is an integral slice spec"""
+    if isinstance(x, (ndarray, usm_ndarray)):
+        if x.ndim > 0:
+            return False
+        if x.dtype.kind not in "ui":
+            return False
+        return True
+    if isinstance(x, bool):
+        return False
+    if isinstance(x, int):
+        return True
+    if _is_buffer(x):
+        mbuf = memoryview(x)
+        if mbuf.ndim == 0:
+            f = mbuf.format
+            return f in "bBhHiIlLqQ"
+        else:
+            return False
+    if callable(getattr(x, "__index__", None)):
+        try:
+            index(x)
+        except (TypeError, ValueError):
+            return False
+        return True
+    return False
+
+
+cdef bint _is_boolean(object x) except *:
+    """Gives True if x is a boolean slice spec"""
+    if isinstance(x, (ndarray, usm_ndarray)):
+        if x.ndim > 0:
+            return False
+        if x.dtype.kind not in "b":
+            return False
+        return True
+    if isinstance(x, bool):
+        return True
+    if isinstance(x, (int, float, complex)):
+        return False
+    if _is_buffer(x):
+        mbuf = memoryview(x)
+        if mbuf.ndim == 0:
+            f = mbuf.format
+            return f in "?"
+        else:
+            return False
+    if callable(getattr(x, "__bool__", None)):
+        try:
+            x.__bool__()
+        except (TypeError, ValueError):
+            return False
+        return True
+    return False
+
+
+def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
+    """
+    Given basic slicing index `ind` and array layout information, produce
+    a 5-tuple (resulting_shape, resulting_strides, resulting_offset,
+    advanced_ind, resulting_advanced_ind_pos)
+    used to construct a view into underlying array over which advanced
+    indexing, if any, is to be performed.
+
+    Raises IndexError for invalid index `ind`.
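+
+    For example (a worked call, derived from the slice branch below):
+
+        _basic_slice_meta(slice(1, None, 2), (10,), (1,), 0)
+        # -> ((5,), (2,), 1, (), -1)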
+ """ + _no_advanced_ind = tuple() + _no_advanced_pos = -1 + if ind is Ellipsis: + return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos) + elif ind is None: + return ( + (1,) + shape, + (0,) + strides, + offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif isinstance(ind, slice): + sl_start, sl_stop, sl_step = ind.indices(shape[0]) + sh0 = _slice_len(sl_start, sl_stop, sl_step) + str0 = sl_step * strides[0] + new_strides = ( + strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:] + ) + new_shape = (sh0, ) + shape[1:] + is_empty = any(sh_i == 0 for sh_i in new_shape) + new_offset = offset if is_empty else offset + sl_start * strides[0] + return ( + new_shape, + new_strides, + new_offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif _is_boolean(ind): + if ind: + return ( + (1,) + shape, + (0,) + strides, + offset, + _no_advanced_ind, + _no_advanced_pos, + ) + else: + return ( + (0,) + shape, + (0,) + strides, + offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif _is_integral(ind): + ind = index(ind) + new_shape = shape[1:] + new_strides = strides[1:] + is_empty = any(sh_i == 0 for sh_i in new_shape) + if 0 <= ind < shape[0]: + new_offset = offset if is_empty else offset + ind * strides[0] + return ( + new_shape, + new_strides, + new_offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif -shape[0] <= ind < 0: + new_offset = ( + offset if is_empty else offset + (shape[0] + ind) * strides[0] + ) + return ( + new_shape, + new_strides, + new_offset, + _no_advanced_ind, + _no_advanced_pos, + ) + else: + raise IndexError( + "Index {0} is out of range for axes 0 with " + "size {1}".format(ind, shape[0])) + elif isinstance(ind, (ndarray, usm_ndarray)): + return (shape, strides, offset, (ind,), 0) + elif isinstance(ind, tuple): + axes_referenced = 0 + ellipses_count = 0 + newaxis_count = 0 + explicit_index = 0 + seen_arrays_yet = False + array_streak_started = False + array_streak_interrupted = False + for i in ind: + if i is None: + newaxis_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif i is Ellipsis: + ellipses_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif isinstance(i, slice): + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif _is_boolean(i): + newaxis_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif _is_integral(i): + axes_referenced += 1 + if not array_streak_started and array_streak_interrupted: + explicit_index += 1 + elif isinstance(i, (ndarray, usm_ndarray)): + if not seen_arrays_yet: + seen_arrays_yet = True + array_streak_started = True + array_streak_interrupted = False + if array_streak_interrupted: + raise IndexError( + "Advanced indexing array specs may not be " + "separated by basic slicing specs." + ) + dt_k = i.dtype.kind + if dt_k == "b" and i.ndim > 0: + axes_referenced += i.ndim + elif dt_k in "ui" and i.ndim > 0: + axes_referenced += 1 + else: + raise IndexError( + "arrays used as indices must be of integer " + "(or boolean) type" + ) + else: + raise IndexError( + "Only integers, slices (`:`), ellipsis (`...`), " + "dpnp.tensor.newaxis (`None`) and integer and " + "boolean arrays are valid indices." 
+ ) + if ellipses_count > 1: + raise IndexError( + "an index can only have a single ellipsis ('...')") + if axes_referenced > len(shape): + raise IndexError( + "too many indices for an array, array is " + "{0}-dimensional, but {1} were indexed".format( + len(shape), axes_referenced)) + if ellipses_count: + ellipses_count = len(shape) - axes_referenced + new_shape_len = (newaxis_count + ellipses_count + + axes_referenced - explicit_index) + new_shape = list() + new_strides = list() + new_advanced_ind = list() + k = 0 + new_advanced_start_pos = -1 + advanced_start_pos_set = False + new_offset = offset + is_empty = False + array_streak = False + for i in range(len(ind)): + ind_i = ind[i] + if (ind_i is Ellipsis): + k_new = k + ellipses_count + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + if any(dim == 0 for dim in shape[k:k_new]): + is_empty = True + new_offset = offset + k = k_new + if array_streak: + array_streak = False + elif ind_i is None: + new_shape.append(1) + new_strides.append(0) + if array_streak: + array_streak = False + elif isinstance(ind_i, slice): + k_new = k + 1 + sl_start, sl_stop, sl_step = ind_i.indices(shape[k]) + sh_i = _slice_len(sl_start, sl_stop, sl_step) + str_i = (1 if sh_i == 0 else sl_step) * strides[k] + new_shape.append(sh_i) + new_strides.append(str_i) + if sh_i > 0 and not is_empty: + new_offset = new_offset + sl_start * strides[k] + if sh_i == 0: + is_empty = True + new_offset = offset + k = k_new + if array_streak: + array_streak = False + elif _is_boolean(ind_i): + new_shape.append(1 if ind_i else 0) + new_strides.append(0) + if array_streak: + array_streak = False + elif _is_integral(ind_i): + if array_streak: + if not isinstance(ind_i, (ndarray, usm_ndarray)): + ind_i = index(ind_i) + # integer will be converted to an array, + # still raise if OOB + if not ( + 0 <= ind_i < shape[k] or -shape[k] <= ind_i < 0 + ): + raise IndexError( + "Index {0} is out of range for axes " + "{1} with size {2}".format(ind_i, k, shape[k]) + ) + new_advanced_ind.append(ind_i) + k_new = k + 1 + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + else: + ind_i = index(ind_i) + if 0 <= ind_i < shape[k]: + k_new = k + 1 + if not is_empty: + new_offset = new_offset + ind_i * strides[k] + k = k_new + elif -shape[k] <= ind_i < 0: + k_new = k + 1 + if not is_empty: + new_offset = ( + new_offset + (shape[k] + ind_i) * strides[k] + ) + k = k_new + else: + raise IndexError( + "Index {0} is out of range for axes " + "{1} with size {2}".format(ind_i, k, shape[k]) + ) + elif isinstance(ind_i, (ndarray, usm_ndarray)): + if not array_streak: + array_streak = True + if not advanced_start_pos_set: + new_advanced_start_pos = len(new_shape) + advanced_start_pos_set = True + new_advanced_ind.append(ind_i) + dt_k = ind_i.dtype.kind + if dt_k == "b": + k_new = k + ind_i.ndim + else: + k_new = k + 1 + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + new_shape.extend(shape[k:]) + new_strides.extend(strides[k:]) + new_shape_len += len(shape) - k + return ( + tuple(new_shape), + tuple(new_strides), + new_offset, + tuple(new_advanced_ind), + new_advanced_start_pos + ) + else: + raise IndexError( + "Only integers, slices (`:`), ellipsis (`...`), " + "dpnp.tensor.newaxis (`None`) and integer and " + "boolean arrays are valid indices." 
+ ) diff --git a/dpnp/tensor/_sorting.py b/dpnp/tensor/_sorting.py new file mode 100644 index 000000000000..c912b4f77cdf --- /dev/null +++ b/dpnp/tensor/_sorting.py @@ -0,0 +1,441 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator +from typing import NamedTuple + +import dpctl.utils as du + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index +from ._tensor_sorting_impl import ( + _argsort_ascending, + _argsort_descending, + _radix_argsort_ascending, + _radix_argsort_descending, + _radix_sort_ascending, + _radix_sort_descending, + _radix_sort_dtype_supported, + _sort_ascending, + _sort_descending, + _topk, +) + +__all__ = ["sort", "argsort", "top_k"] + + +def _get_mergesort_impl_fn(descending): + return _sort_descending if descending else _sort_ascending + + +def _get_radixsort_impl_fn(descending): + return _radix_sort_descending if descending else _radix_sort_ascending + + +def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None): + """sort(x, axis=-1, descending=False, stable=True) + + Returns a sorted copy of an input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + kind (Optional[Literal["stable", "mergesort", "radixsort"]]): + Sorting algorithm. 
The default is `"stable"`, which uses parallel + merge-sort or parallel radix-sort algorithms depending on the + array data type. + Returns: + usm_ndarray: + a sorted array. The returned array has the same data type and + the same shape as the input array `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.copy(x, order="C") + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + if kind is None: + kind = "stable" + if not isinstance(kind, str) or kind not in [ + "stable", + "radixsort", + "mergesort", + ]: + raise ValueError( + "Unsupported kind value. Expected 'stable', 'mergesort', " + f"or 'radixsort', but got '{kind}'" + ) + if kind == "mergesort": + impl_fn = _get_mergesort_impl_fn(descending) + elif kind == "radixsort": + if _radix_sort_dtype_supported(x.dtype.num): + impl_fn = _get_radixsort_impl_fn(descending) + else: + raise ValueError(f"Radix sort is not supported for {x.dtype}") + else: + dt = x.dtype + if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]: + impl_fn = _get_radixsort_impl_fn(descending) + else: + impl_fn = _get_mergesort_impl_fn(descending) + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, order="C") + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + res = dpt.empty_like(arr, order="C") + ht_ev, impl_ev = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + return res + + +def _get_mergeargsort_impl_fn(descending): + return _argsort_descending if descending else _argsort_ascending + + +def _get_radixargsort_impl_fn(descending): + return _radix_argsort_descending if descending else _radix_argsort_ascending + + +def argsort(x, axis=-1, descending=False, stable=True, kind=None): + """argsort(x, axis=-1, descending=False, stable=True) + + Returns the indices that sort an array `x` along a specified axis. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + kind (Optional[Literal["stable", "mergesort", "radixsort"]]): + Sorting algorithm. 
The default is `"stable"`, which uses parallel + merge-sort or parallel radix-sort algorithms depending on the + array data type. + + Returns: + usm_ndarray: + an array of indices. The returned array has the same shape as + the input array `x`. The return array has default array index + data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue), order="C" + ) + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + if kind is None: + kind = "stable" + if not isinstance(kind, str) or kind not in [ + "stable", + "radixsort", + "mergesort", + ]: + raise ValueError( + "Unsupported kind value. Expected 'stable', 'mergesort', " + f"or 'radixsort', but got '{kind}'" + ) + if kind == "mergesort": + impl_fn = _get_mergeargsort_impl_fn(descending) + elif kind == "radixsort": + if _radix_sort_dtype_supported(x.dtype.num): + impl_fn = _get_radixargsort_impl_fn(descending) + else: + raise ValueError(f"Radix sort is not supported for {x.dtype}") + else: + dt = x.dtype + if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]: + impl_fn = _get_radixargsort_impl_fn(descending) + else: + impl_fn = _get_mergeargsort_impl_fn(descending) + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + index_dt = ti.default_device_index_type(exec_q) + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, impl_ev = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + return res + + +def _get_top_k_largest(mode): + modes = {"largest": True, "smallest": False} + try: + return modes[mode] + except KeyError: + raise ValueError( + f"`mode` must be `largest` or `smallest`. Got `{mode}`." + ) + + +class TopKResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + + +def top_k(x, k, /, *, axis=None, mode="largest"): + """top_k(x, k, axis=None, mode="largest") + + Returns the `k` largest or smallest values and their indices in the input + array `x` along the specified axis `axis`. + + Args: + x (usm_ndarray): + input array. + k (int): + number of elements to find. Must be a positive integer value. + axis (Optional[int]): + axis along which to search. If `None`, the search will be performed + over the flattened array. Default: ``None``. + mode (Literal["largest", "smallest"]): + search mode. Must be one of the following modes: + + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + + Default: `"largest"`. 
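+
+            As an illustrative shape check (hypothetical input): for `x` of
+            shape (3, 5), `top_k(x, 2, axis=1)` yields `values` and `indices`
+            of shape (3, 2), per the rules described below.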
+ + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, indices)` whose + + * first element `values` will be an array containing the `k` + largest or smallest elements of `x`. The array has the same data + type as `x`. If `axis` was `None`, `values` will be a + one-dimensional array with shape `(k,)` and otherwise, `values` + will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` + * second element `indices` will be an array containing indices of + `x` that result in `values`. The array will have the same shape + as `values` and will have the default array index data type. + """ + largest = _get_top_k_largest(mode) + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}") + + k = operator.index(k) + if k < 0: + raise ValueError("`k` must be a positive integer value") + + nd = x.ndim + if axis is None: + sz = x.size + if nd == 0: + if k > 1: + raise ValueError(f"`k`={k} is out of bounds 1") + return TopKResult( + dpt.copy(x, order="C"), + dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue) + ), + ) + arr = x + n_search_dims = None + res_sh = k + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + sz = x.shape[axis] + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + n_search_dims = 1 + res_sh = arr.shape[: nd - 1] + (k,) + + if k > sz: + raise ValueError(f"`k`={k} is out of bounds {sz}") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + res_usm_type = arr.usm_type + if arr.flags.c_contiguous: + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=arr, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=tmp, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if axis is not None and a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + vals = dpt.permute_dims(vals, inv_perm) + inds = dpt.permute_dims(inds, inv_perm) + + return TopKResult(vals, inds) diff --git a/dpnp/tensor/_statistical_functions.py b/dpnp/tensor/_statistical_functions.py new file mode 100644 index 000000000000..a2015488aff2 --- /dev/null +++ b/dpnp/tensor/_statistical_functions.py @@ -0,0 +1,379 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. + +import dpctl.utils as du + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as tei +import dpnp.tensor._tensor_impl as ti +import dpnp.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_tuple + + +def _var_impl(x, axis, correction, keepdims): + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + red_nd = len(axis) + perm = perm + list(axis) + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + inp_dt + if inp_dt.kind == "f" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + if inp_dt != res_dt: + buf = dpt.empty_like(x, dtype=res_dt) + ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_buf, c_e1) + else: + buf = x + # calculate mean + buf2 = dpt.permute_dims(buf, perm) + res_shape = buf2.shape[: nd - red_nd] + # use keepdims=True path for later broadcasting + if red_nd == 0: + mean_ary = dpt.empty_like(buf) + dep_evs = _manager.submitted_events + ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=buf, dst=mean_ary, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e1, c_e2) + else: + mean_ary = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + dep_evs = _manager.submitted_events + ht_e1, r_e1 = tri._sum_over_axis( + src=buf2, + trailing_dims_to_reduce=red_nd, + dst=mean_ary, + sycl_queue=q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_e1, r_e1) + + mean_ary_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + mean_ary = dpt.permute_dims( + dpt.reshape(mean_ary, mean_ary_shape), inv_perm + ) + # divide in-place to get mean + mean_ary_shape = 
mean_ary.shape + + dep_evs = _manager.submitted_events + ht_e2, d_e1 = tei._divide_by_scalar( + src=mean_ary, scalar=nelems, dst=mean_ary, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e2, d_e1) + + # subtract mean from original array to get deviations + dev_ary = dpt.empty_like(buf) + if mean_ary_shape != buf.shape: + mean_ary = dpt.broadcast_to(mean_ary, buf.shape) + ht_e4, su_e = tei._subtract( + src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1] + ) + _manager.add_event_pair(ht_e4, su_e) + # square deviations + ht_e5, sq_e = tei._square( + src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e] + ) + _manager.add_event_pair(ht_e5, sq_e) + + # take sum of squared deviations + dev_ary2 = dpt.permute_dims(dev_ary, perm) + if red_nd == 0: + res = dev_ary + else: + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + ht_e6, r_e2 = tri._sum_over_axis( + src=dev_ary2, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=q, + depends=[sq_e], + ) + _manager.add_event_pair(ht_e6, r_e2) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + res_shape = res.shape + # when nelems - correction <= 0, yield nans + div = max(nelems - correction, 0) + if not div: + div = dpt.nan + dep_evs = _manager.submitted_events + ht_e7, d_e2 = tei._divide_by_scalar( + src=res, scalar=div, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e7, d_e2) + return res, [d_e2] + + +def mean(x, axis=None, keepdims=False): + """mean(x, axis=None, keepdims=False) + + Calculates the arithmetic mean of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the arithmetic means must be computed. If + a tuple of unique integers, the means are computed over multiple + axes. If `None`, the mean is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the arithmetic means. If the mean was computed + over the entire array, a zero-dimensional array is returned. + + If `x` has a floating-point data type, the returned array will have + the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. 
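+
+    A small usage sketch (illustrative values; assumes a default-selected
+    device is available):
+
+        x = dpt.asarray([[1.0, 2.0], [3.0, 4.0]])
+        mean(x, axis=1)   # row means -> [1.5, 3.5], shape (2,)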
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + sum_nd = len(axis) + perm = perm + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - sum_nd] + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + x.dtype + if x.dtype.kind in "fc" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + if sum_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e1, r_e = tri._sum_over_axis( + src=arr2, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_e1, r_e) + else: + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = tri._sum_over_axis( + src=tmp, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, r_e) + + if keepdims: + res_shape = res_shape + (1,) * sum_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + + dep_evs = _manager.submitted_events + ht_e2, div_e = tei._divide_by_scalar( + src=res, scalar=nelems, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e2, div_e) + return res + + +def var(x, axis=None, correction=0.0, keepdims=False): + """var(x, axis=None, correction=0.0, keepdims=False) + + Calculates the variance of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the variances must be computed. If a tuple + of unique integers, the variances are computed over multiple axes. + If `None`, the variance is computed over the entire array. + Default: `None`. + correction (Optional[float, int]): + degrees of freedom adjustment. The divisor used in calculating the + variance is `N - correction`, where `N` corresponds to the total + number of elements over which the variance is calculated. + Default: `0.0`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the variances. If the variance was computed + over the entire array, a zero-dimensional array is returned. + + If `x` has a real-valued floating-point data type, the returned + array will have the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + if not isinstance(correction, (int, float)): + raise TypeError( + "Expected a Python integer or float for `correction`, got" + f"{type(x)}" + ) + + if x.dtype.kind == "c": + raise ValueError("`var` does not support complex types") + + res, _ = _var_impl(x, axis, correction, keepdims) + return res + + +def std(x, axis=None, correction=0.0, keepdims=False): + """std(x, axis=None, correction=0.0, keepdims=False) + + Calculates the standard deviation of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the standard deviations must be computed. + If a tuple of unique integers, the standard deviations are computed + over multiple axes. If `None`, the standard deviation is computed + over the entire array. Default: `None`. + correction (Optional[float, int]): + degrees of freedom adjustment. The divisor used in calculating the + standard deviation is `N - correction`, where `N` corresponds to the + total number of elements over which the standard deviation is + calculated. Default: `0.0`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the standard deviations. If the standard + deviation was computed over the entire array, a zero-dimensional + array is returned. + + If `x` has a real-valued floating-point data type, the returned + array will have the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + if not isinstance(correction, (int, float)): + raise TypeError( + "Expected a Python integer or float for `correction`," + f"got {type(x)}" + ) + + if x.dtype.kind == "c": + raise ValueError("`std` does not support complex types") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + res, deps = _var_impl(x, axis, correction, keepdims) + ht_ev, sqrt_ev = tei._sqrt( + src=res, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, sqrt_ev) + return res diff --git a/dpnp/tensor/_stride_utils.pxi b/dpnp/tensor/_stride_utils.pxi new file mode 100644 index 000000000000..3caf8dd8fd1f --- /dev/null +++ b/dpnp/tensor/_stride_utils.pxi @@ -0,0 +1,314 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+
+from cpython.mem cimport PyMem_Free, PyMem_Malloc
+from cpython.ref cimport Py_INCREF
+from cpython.tuple cimport PyTuple_New, PyTuple_SetItem
+
+
+cdef int ERROR_MALLOC = 1
+cdef int ERROR_INTERNAL = -1
+cdef int ERROR_INCORRECT_ORDER = 2
+cdef int ERROR_UNEXPECTED_STRIDES = 3
+
+cdef int USM_ARRAY_C_CONTIGUOUS = 1
+cdef int USM_ARRAY_F_CONTIGUOUS = 2
+cdef int USM_ARRAY_WRITABLE = 4
+
+
+cdef Py_ssize_t shape_to_elem_count(int nd, Py_ssize_t *shape_arr):
+    """
+    Computes number of elements in an array.
+    """
+    cdef Py_ssize_t count = 1
+    for i in range(nd):
+        count *= shape_arr[i]
+    return count
+
+
+cdef int _from_input_shape_strides(
+    int nd, object shape, object strides, int itemsize, char order,
+    Py_ssize_t **shape_ptr, Py_ssize_t **strides_ptr,
+    Py_ssize_t *nelems, Py_ssize_t *min_disp, Py_ssize_t *max_disp,
+    int *contig
+):
+    """
+    Arguments: nd, shape, strides, itemsize, order
+    Modifies:
+        shape_ptr - pointer to C array for shape values
+        strides_ptr - pointer to C array for strides values
+        nelems - Number of elements in array
+        min_disp = min(dot(strides, index) over all valid indices for shape)
+        max_disp = max(dot(strides, index) over all valid indices for shape)
+        contig = enumeration for array contiguity
+    Returns: 0 on success, error code otherwise.
+ On success pointers point to allocated arrays, + Otherwise they are set to NULL + """ + cdef int i + cdef int j + cdef bint all_incr = 1 + cdef bint all_decr = 1 + cdef bint strides_inspected = 0 + cdef Py_ssize_t elem_count = 1 + cdef Py_ssize_t min_shift = 0 + cdef Py_ssize_t max_shift = 0 + cdef Py_ssize_t str_i + cdef Py_ssize_t* shape_arr + cdef Py_ssize_t* strides_arr + + if (int(order) not in [ord("C"), ord("F"), ord("c"), ord("f")]): + return ERROR_INCORRECT_ORDER + + # 0-d array + if (nd == 0): + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) + nelems[0] = 1 + min_disp[0] = 0 + max_disp[0] = 0 + shape_ptr[0] = (0) + strides_ptr[0] = (0) + return 0 + + shape_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not shape_arr): + return ERROR_MALLOC + shape_ptr[0] = shape_arr + for i in range(0, nd): + shape_arr[i] = shape[i] + elem_count *= shape_arr[i] + if elem_count == 0: + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) + nelems[0] = 1 + min_disp[0] = 0 + max_disp[0] = 0 + if strides is None: + strides_ptr[0] = (0) + else: + strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not strides_arr): + PyMem_Free(shape_ptr[0]) + shape_ptr[0] = (0) + return ERROR_MALLOC + strides_ptr[0] = strides_arr + for i in range(0, nd): + strides_arr[i] = strides[i] + return 0 + nelems[0] = elem_count + if (strides is None): + # no need to allocate and populate strides + if order == ord("C") or order == ord("c"): + contig[0] = USM_ARRAY_C_CONTIGUOUS + else: + contig[0] = USM_ARRAY_F_CONTIGUOUS + if nd == 1: + contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS + else: + j = 0 + for i in range(nd): + if shape_arr[i] > 1: + j = j + 1 + if j < 2: + contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS + min_disp[0] = 0 + max_disp[0] = (elem_count - 1) + strides_ptr[0] = (0) + return 0 + elif ((isinstance(strides, (list, tuple)) or hasattr(strides, "tolist")) + and len(strides) == nd): + strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not strides_arr): + PyMem_Free(shape_ptr[0]) + shape_ptr[0] = (0) + return ERROR_MALLOC + strides_ptr[0] = strides_arr + for i in range(0, nd): + str_i = strides[i] + strides_arr[i] = str_i + if str_i > 0: + max_shift += str_i * (shape_arr[i] - 1) + else: + min_shift += str_i * (shape_arr[i] - 1) + min_disp[0] = min_shift + max_disp[0] = max_shift + if max_shift == min_shift + (elem_count - 1): + if elem_count == 1: + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) + return 0 + if nd == 1: + if strides_arr[0] == 1: + contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS + else: + contig[0] = 0 + return 0 + i = 0 + while i < nd: + if shape_arr[i] == 1: + i = i + 1 + continue + j = i + 1 + while (j < nd and shape_arr[j] == 1): + j = j + 1 + if j < nd: + strides_inspected = 1 + if all_incr: + all_incr = ( + (strides_arr[i] > 0) and + (strides_arr[j] > 0) and + (strides_arr[i] <= strides_arr[j]) + ) + if all_decr: + all_decr = ( + (strides_arr[i] > 0) and + (strides_arr[j] > 0) and + (strides_arr[i] >= strides_arr[j]) + ) + i = j + else: + if not strides_inspected: + # all dimensions have size 1 except + # dimension 'i'. 
Array is both C and F + # contiguous + strides_inspected = 1 + all_incr = (strides_arr[i] == 1) + all_decr = all_incr + break + # should only set contig flags on actually obtained + # values, rather than default values + all_incr = all_incr and strides_inspected + all_decr = all_decr and strides_inspected + if all_incr and all_decr: + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) + elif all_incr: + contig[0] = USM_ARRAY_F_CONTIGUOUS + elif all_decr: + contig[0] = USM_ARRAY_C_CONTIGUOUS + else: + contig[0] = 0 + return 0 + else: + contig[0] = 0 # non-contiguous + return 0 + else: + PyMem_Free(shape_ptr[0]) + shape_ptr[0] = (0) + return ERROR_UNEXPECTED_STRIDES + # return ERROR_INTERNAL + + +cdef object _make_int_tuple(int nd, const Py_ssize_t *ary): + """ + Makes Python tuple from C array + """ + cdef tuple res + cdef object tmp + if (ary): + res = PyTuple_New(nd) + for i in range(nd): + tmp = ary[i] + Py_INCREF(tmp) # SetItem steals the reference + PyTuple_SetItem(res, i, tmp) + return res + else: + return None + + +cdef object _make_reversed_int_tuple(int nd, const Py_ssize_t *ary): + """ + Makes Python reversed tuple from C array + """ + cdef tuple res + cdef object tmp + cdef int i + cdef int nd_1 + if (ary): + res = PyTuple_New(nd) + nd_1 = nd - 1 + for i in range(nd): + tmp = ary[i] + Py_INCREF(tmp) # SetItem steals the reference + PyTuple_SetItem(res, nd_1 - i, tmp) + return res + else: + return None + + +cdef object _c_contig_strides(int nd, Py_ssize_t *shape): + """ + Makes Python tuple for strides of C-contiguous array + """ + cdef tuple cc_strides = PyTuple_New(nd) + cdef object si = 1 + cdef int i + cdef int nd_1 = nd - 1 + for i in range(0, nd): + Py_INCREF(si) # SetItem steals the reference + PyTuple_SetItem(cc_strides, nd_1 - i, si) + si = si * shape[nd_1 - i] + return cc_strides + + +cdef object _f_contig_strides(int nd, Py_ssize_t *shape): + """ + Makes Python tuple for strides of F-contiguous array + """ + cdef tuple fc_strides = PyTuple_New(nd) + cdef object si = 1 + for i in range(0, nd): + Py_INCREF(si) # SetItem steals the reference + PyTuple_SetItem(fc_strides, i, si) + si = si * shape[i] + return fc_strides + +cdef object _swap_last_two(tuple t): + """ + Swap last two elements of a tuple + """ + cdef int nd = len(t) + cdef tuple res + cdef int i + cdef object tmp + if (nd < 2): + return t + res = PyTuple_New(nd) + # copy all elements except the last two + for i in range(0, nd-2): + tmp = t[i] + Py_INCREF(tmp) # SetItem steals the reference + PyTuple_SetItem(res, i, tmp) + # swap the last two elements + tmp = t[nd-1] + Py_INCREF(tmp) # SetItem steals + PyTuple_SetItem(res, nd - 2, tmp) + tmp = t[nd-2] + Py_INCREF(tmp) # SetItem steals + PyTuple_SetItem(res, nd - 1, tmp) + return res diff --git a/dpnp/tensor/_testing.py b/dpnp/tensor/_testing.py new file mode 100644 index 000000000000..fbec13fdeb36 --- /dev/null +++ b/dpnp/tensor/_testing.py @@ -0,0 +1,163 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+
+import dpnp.tensor as dpt
+
+from ._manipulation_functions import _broadcast_shape_impl
+from ._type_utils import _to_device_supported_dtype
+
+
+def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
+    z1r = dpt.real(z1)
+    z1i = dpt.imag(z1)
+    z2r = dpt.real(z2)
+    z2i = dpt.imag(z2)
+    if equal_nan:
+        check1 = dpt.all(dpt.isnan(z1r) == dpt.isnan(z2r)) and dpt.all(
+            dpt.isnan(z1i) == dpt.isnan(z2i)
+        )
+    else:
+        check1 = (
+            dpt.logical_not(dpt.any(dpt.isnan(z1r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z1i)))
+        ) and (
+            dpt.logical_not(dpt.any(dpt.isnan(z2r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z2i)))
+        )
+    if not check1:
+        return check1
+    mr = dpt.isinf(z1r)
+    mi = dpt.isinf(z1i)
+    check2 = dpt.all(mr == dpt.isinf(z2r)) and dpt.all(mi == dpt.isinf(z2i))
+    if not check2:
+        return check2
+    check3 = dpt.all(z1r[mr] == z2r[mr]) and dpt.all(z1i[mi] == z2i[mi])
+    if not check3:
+        return check3
+    mr = dpt.isfinite(z1r)
+    mi = dpt.isfinite(z1i)
+    mv1 = z1r[mr]
+    mv2 = z2r[mr]
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    if not check4:
+        return check4
+    mv1 = z1i[mi]
+    mv2 = z2i[mi]
+    check5 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    return check5
+
+
+def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
+    if equal_nan:
+        check1 = dpt.all(dpt.isnan(r1) == dpt.isnan(r2))
+    else:
+        check1 = dpt.logical_not(dpt.any(dpt.isnan(r1))) and dpt.logical_not(
+            dpt.any(dpt.isnan(r2))
+        )
+    if not check1:
+        return check1
+    mr = dpt.isinf(r1)
+    check2 = dpt.all(mr == dpt.isinf(r2))
+    if not check2:
+        return check2
+    check3 = dpt.all(r1[mr] == r2[mr])
+    if not check3:
+        return check3
+    m = dpt.isfinite(r1)
+    mv1 = r1[m]
+    mv2 = r2[m]
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    return check4
+
+
+def _allclose_others(r1, r2):
+    return dpt.all(r1 == r2)
+
+
+def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
+    """allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False)
+
+    Returns True if two arrays are element-wise equal within tolerances.
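+
+    A minimal sketch (illustrative values; assumes a default-selected
+    device):
+
+        a = dpt.asarray([1.0, 2.0])
+        b = dpt.asarray([1.0, 2.0 + 5e-6])
+        allclose(a, b)   # 0-d boolean array holding True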
+ + The testing is based on the following elementwise comparison: + + abs(a - b) <= max(atol, rtol * max(abs(a), abs(b))) + """ + if not isinstance(a1, dpt.usm_ndarray): + raise TypeError( + f"Expected dpnp.tensor.usm_ndarray type, got {type(a1)}." + ) + if not isinstance(a2, dpt.usm_ndarray): + raise TypeError( + f"Expected dpnp.tensor.usm_ndarray type, got {type(a2)}." + ) + atol = float(atol) + rtol = float(rtol) + if atol < 0.0 or rtol < 0.0: + raise ValueError( + "Absolute and relative tolerances must be non-negative" + ) + equal_nan = bool(equal_nan) + exec_q = dpt.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2))) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_sh = _broadcast_shape_impl([a1.shape, a2.shape]) + b1 = a1 + b2 = a2 + if b1.dtype == b2.dtype: + res_dt = b1.dtype + else: + res_dt = np.promote_types(b1.dtype, b2.dtype) + res_dt = _to_device_supported_dtype(res_dt, exec_q.sycl_device) + b1 = dpt.astype(b1, res_dt) + b2 = dpt.astype(b2, res_dt) + + b1 = dpt.broadcast_to(b1, res_sh) + b2 = dpt.broadcast_to(b2, res_sh) + + k = b1.dtype.kind + if k == "c": + return _allclose_complex_fp(b1, b2, atol, rtol, equal_nan) + elif k == "f": + return _allclose_real_fp(b1, b2, atol, rtol, equal_nan) + else: + return _allclose_others(b1, b2) diff --git a/dpnp/tensor/_type_utils.py b/dpnp/tensor/_type_utils.py new file mode 100644 index 000000000000..b03ca1e1c79d --- /dev/null +++ b/dpnp/tensor/_type_utils.py @@ -0,0 +1,1004 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from __future__ import annotations + +import numpy as np + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + + +def _all_data_types(_fp16, _fp64): + _non_fp_types = [ + dpt.bool, + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + ] + if _fp64: + if _fp16: + return _non_fp_types + [ + dpt.float16, + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + else: + return _non_fp_types + [ + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + else: + if _fp16: + return _non_fp_types + [ + dpt.float16, + dpt.float32, + dpt.complex64, + ] + else: + return _non_fp_types + [ + dpt.float32, + dpt.complex64, + ] + + +def _acceptance_fn_default_binary( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + return True + + +def _acceptance_fn_default_unary(arg_dtype, ret_buf_dt, res_dt, sycl_dev): + return True + + +def _acceptance_fn_divide( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + # both are being promoted, if the kind of result is + # different than the kind of original input dtypes, + # we use default dtype for the resulting kind. + # This covers, e.g. (array_dtype_i1 / array_dtype_u1) + # result of which in divide is double (in NumPy), but + # regular type promotion rules peg at float16 + if (ret_buf1_dt.kind != arg1_dtype.kind) and ( + ret_buf2_dt.kind != arg2_dtype.kind + ): + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _acceptance_fn_negative(arg_dtype, buf_dt, res_dt, sycl_dev): + # negative is not defined for boolean data type + if arg_dtype.char == "?": + raise ValueError( + "The `negative` function, the `-` operator, is not supported " + "for inputs of data type bool, use the `~` operator or the " + "`logical_not` function instead" + ) + else: + return True + + +def _acceptance_fn_reciprocal(arg_dtype, buf_dt, res_dt, sycl_dev): + # if the kind of result is different from the kind of input, we use the + # default floating-point dtype for the resulting kind. This guarantees + # alignment of reciprocal and divide output types. + if buf_dt.kind != arg_dtype.kind: + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _acceptance_fn_round(arg_dtype, buf_dt, res_dt, sycl_dev): + # for boolean input, prefer floating-point output over integral + if arg_dtype.kind == "b" and res_dt.kind != "f": + return False + return True + + +def _acceptance_fn_subtract( + arg1_dtype, arg2_dtype, buf1_dt, buf2_dt, res_dt, sycl_dev +): + # subtract is not defined for boolean data type + if arg1_dtype.char == "?" and arg2_dtype.char == "?": + raise ValueError( + "The `subtract` function, the `-` operator, is not supported " + "for inputs of data type bool, use the `^` operator, the " + "`bitwise_xor`, or the `logical_xor` function instead" + ) + else: + return True + + +def _can_cast( + from_: dpt.dtype, to_: dpt.dtype, _fp16: bool, _fp64: bool, casting="safe" +) -> bool: + """ + Can `from_` be cast to `to_` safely on a device with + fp16 and fp64 aspects as given? 
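+
+    For example, assuming a device with neither fp16 nor fp64,
+    `_can_cast(dpt.int64, dpt.float32, False, False)` gives True even
+    though NumPy's `can_cast` rejects it, since `float32` is the maximal
+    inexact type available on such a device.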
+ """ + if not _dtype_supported_by_device_impl(to_, _fp16, _fp64): + return False + can_cast_v = np.can_cast(from_, to_, casting=casting) # ask NumPy + if _fp16 and _fp64: + return can_cast_v + if not can_cast_v: + if ( + from_.kind in "biu" + and to_.kind in "fc" + and _is_maximal_inexact_type(to_, _fp16, _fp64) + ): + return True + + return can_cast_v + + +def _dtype_supported_by_device_impl( + dt: dpt.dtype, has_fp16: bool, has_fp64: bool +) -> bool: + if has_fp64: + if not has_fp16: + if dt is dpt.float16: + return False + else: + if dt is dpt.float64: + return False + elif dt is dpt.complex128: + return False + if not has_fp16 and dt is dpt.float16: + return False + return True + + +def _find_buf_dtype(arg_dtype, query_fn, sycl_dev, acceptance_fn): + res_dt = query_fn(arg_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf_dt in all_dts: + if _can_cast(arg_dtype, buf_dt, _fp16, _fp64): + res_dt = query_fn(buf_dt) + if res_dt: + acceptable = acceptance_fn(arg_dtype, buf_dt, res_dt, sycl_dev) + if acceptable: + return buf_dt, res_dt + else: + continue + + return None, None + + +def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf1_dt in all_dts: + for buf2_dt in all_dts: + if _can_cast(arg1_dtype, buf1_dt, _fp16, _fp64) and _can_cast( + arg2_dtype, buf2_dt, _fp16, _fp64 + ): + res_dt = query_fn(buf1_dt, buf2_dt) + if res_dt: + ret_buf1_dt = None if buf1_dt == arg1_dtype else buf1_dt + ret_buf2_dt = None if buf2_dt == arg2_dtype else buf2_dt + if ret_buf1_dt is None or ret_buf2_dt is None: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + acceptable = acceptance_fn( + arg1_dtype, + arg2_dtype, + ret_buf1_dt, + ret_buf2_dt, + res_dt, + sycl_dev, + ) + if acceptable: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + continue + + return None, None, None + + +def _find_buf_dtype_in_place_op(arg1_dtype, arg2_dtype, query_fn, sycl_dev): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if _can_cast(arg2_dtype, arg1_dtype, _fp16, _fp64, casting="same_kind"): + res_dt = query_fn(arg1_dtype, arg1_dtype) + if res_dt: + return arg1_dtype, res_dt + + return None, None + + +def _get_device_default_dtype(dt_kind, sycl_dev): + if dt_kind == "b": + return dpt.dtype(ti.default_device_bool_type(sycl_dev)) + elif dt_kind == "i": + return dpt.dtype(ti.default_device_int_type(sycl_dev)) + elif dt_kind == "u": + return dpt.dtype(ti.default_device_uint_type(sycl_dev)) + elif dt_kind == "f": + return dpt.dtype(ti.default_device_fp_type(sycl_dev)) + elif dt_kind == "c": + return dpt.dtype(ti.default_device_complex_type(sycl_dev)) + raise RuntimeError + + +def _is_maximal_inexact_type(dt: dpt.dtype, _fp16: bool, _fp64: bool): + """ + Return True if data type `dt` is the + maximal size inexact data type + """ + if _fp64: + return dt in [dpt.float64, dpt.complex128] + return dt in [dpt.float32, dpt.complex64] + + +def _to_device_supported_dtype(dt, dev): + has_fp16 = dev.has_aspect_fp16 + has_fp64 = dev.has_aspect_fp64 + + return _to_device_supported_dtype_impl(dt, has_fp16, has_fp64) + + +def _to_device_supported_dtype_impl(dt, has_fp16, has_fp64): + if has_fp64: + 
if not has_fp16: + if dt is dpt.float16: + return dpt.float32 + else: + if dt is dpt.float64: + return dpt.float32 + elif dt is dpt.complex128: + return dpt.complex64 + if not has_fp16 and dt is dpt.float16: + return dpt.float32 + return dt + + +class WeakBooleanType: + """Python type representing type of Python boolean objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakIntegralType: + """Python type representing type of Python integral objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakFloatingType: + """Python type representing type of Python floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakComplexType: + """Python type representing type of Python complex floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +def _weak_type_num_kind(o): + _map = {"?": 0, "i": 1, "f": 2, "c": 3} + if isinstance(o, WeakBooleanType): + return _map["?"] + if isinstance(o, WeakIntegralType): + return _map["i"] + if isinstance(o, WeakFloatingType): + return _map["f"] + if isinstance(o, WeakComplexType): + return _map["c"] + raise TypeError( + f"Unexpected type {o} while expecting " + "`WeakBooleanType`, `WeakIntegralType`," + "`WeakFloatingType`, or `WeakComplexType`." + ) + + +def _strong_dtype_num_kind(o): + _map = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 3} + if not isinstance(o, dpt.dtype): + raise TypeError + k = o.kind + if k in _map: + return _map[k] + raise ValueError(f"Unrecognized kind {k} for dtype {o}") + + +def _is_weak_dtype(dtype): + return isinstance( + dtype, + (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType), + ) + + +def _resolve_weak_types(o1_dtype, o2_dtype, dev): + """Resolves weak data type per NEP-0050""" + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev): + """ + Resolves weak data type per NEP-0050 for comparisons and + divide, where result type is known and special behavior + is needed to handle mixed integer kinds and Python integers + without overflow + """ + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = 
_strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + if o1_kind_num == o2_kind_num and isinstance( + o1_dtype, WeakIntegralType + ): + o1_val = o1_dtype.get() + o2_iinfo = dpt.iinfo(o2_dtype) + if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max): + return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + if o1_kind_num == o2_kind_num and isinstance( + o2_dtype, WeakIntegralType + ): + o2_val = o2_dtype.get() + o1_iinfo = dpt.iinfo(o1_dtype) + if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max): + return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val)) + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev): + """ + Resolves weak data types per NEP-0050, + where the second and third arguments are + permitted to be weak types + """ + if _is_weak_dtype(st_dtype): + raise ValueError + if _is_weak_dtype(dtype1): + if _is_weak_dtype(dtype2): + kind_num1 = _weak_type_num_kind(dtype1) + kind_num2 = _weak_type_num_kind(dtype2) + st_kind_num = _strong_dtype_num_kind(st_dtype) + + if kind_num1 > st_kind_num: + if isinstance(dtype1, WeakIntegralType): + ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev)) + elif isinstance(dtype1, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + ret_dtype1 = dpt.complex64 + else: + ret_dtype1 = _to_device_supported_dtype(dpt.complex128, dev) + else: + ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev) + else: + ret_dtype1 = st_dtype + + if kind_num2 > st_kind_num: + if isinstance(dtype2, WeakIntegralType): + ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev)) + elif isinstance(dtype2, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + ret_dtype2 = dpt.complex64 + else: + ret_dtype2 = _to_device_supported_dtype(dpt.complex128, dev) + else: + ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev) + else: + ret_dtype2 = st_dtype + + return ret_dtype1, ret_dtype2 + + max_dt_num_kind, max_dtype = max( + [ + (_strong_dtype_num_kind(st_dtype), st_dtype), + (_strong_dtype_num_kind(dtype2), dtype2), + ] + ) + dt1_kind_num = _weak_type_num_kind(dtype1) + if dt1_kind_num > max_dt_num_kind: + if isinstance(dtype1, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), dtype2 + if isinstance(dtype1, WeakComplexType): + if max_dtype is dpt.float16 or max_dtype is dpt.float32: + return dpt.complex64, dtype2 + return ( + _to_device_supported_dtype(dpt.complex128, dev), + dtype2, + ) + return _to_device_supported_dtype(dpt.float64, dev), dtype2 + else: + return
max_dtype, dtype2 + elif _is_weak_dtype(dtype2): + max_dt_num_kind, max_dtype = max( + [ + (_strong_dtype_num_kind(st_dtype), st_dtype), + (_strong_dtype_num_kind(dtype1), dtype1), + ] + ) + dt2_kind_num = _weak_type_num_kind(dtype2) + if dt2_kind_num > max_dt_num_kind: + if isinstance(dtype2, WeakIntegralType): + return dtype1, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dtype2, WeakComplexType): + if max_dtype is dpt.float16 or max_dtype is dpt.float32: + return dtype1, dpt.complex64 + return ( + dtype1, + _to_device_supported_dtype(dpt.complex128, dev), + ) + return dtype1, _to_device_supported_dtype(dpt.float64, dev) + else: + return dtype1, max_dtype + else: + # both are strong dtypes + # return unmodified + return dtype1, dtype2 + + +def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev): + """Resolves one weak data type with one strong data type per NEP-0050""" + if _is_weak_dtype(st_dtype): + raise ValueError + if _is_weak_dtype(dtype): + st_kind_num = _strong_dtype_num_kind(st_dtype) + kind_num = _weak_type_num_kind(dtype) + if kind_num > st_kind_num: + if isinstance(dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dtype, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + return dpt.complex64 + return _to_device_supported_dtype(dpt.complex128, dev) + return _to_device_supported_dtype(dpt.float64, dev) + else: + return st_dtype + else: + return dtype + + +class finfo_object: + """ + `numpy.finfo` subclass which returns Python floating-point scalars for + `eps`, `max`, `min`, and `smallest_normal` attributes. + """ + + def __init__(self, dtype): + _supported_dtype([dpt.dtype(dtype)]) + self._finfo = np.finfo(dtype) + + @property + def bits(self): + """Number of bits occupied by the real-valued floating-point data type.""" + return int(self._finfo.bits) + + @property + def smallest_normal(self): + """ + Smallest positive real-valued floating-point number with full + precision. + """ + return float(self._finfo.smallest_normal) + + @property + def tiny(self): + """An alias for `smallest_normal`""" + return float(self._finfo.tiny) + + @property + def eps(self): + """ + Difference between 1.0 and the next smallest representable real-valued + floating-point number larger than 1.0 according to the IEEE-754 + standard. + """ + return float(self._finfo.eps) + + @property + def epsneg(self): + """ + Difference between 1.0 and the next smallest representable real-valued + floating-point number smaller than 1.0 according to the IEEE-754 + standard. + """ + return float(self._finfo.epsneg) + + @property + def min(self): + """Smallest representable real-valued number.""" + return float(self._finfo.min) + + @property + def max(self): + """Largest representable real-valued number.""" + return float(self._finfo.max) + + @property + def resolution(self): + """The approximate decimal resolution of this type.""" + return float(self._finfo.resolution) + + @property + def precision(self): + """ + The approximate number of decimal digits to which this kind of + floating point type is precise. + """ + return float(self._finfo.precision) + + @property + def dtype(self): + """ + The dtype for which finfo returns information. For complex input, the + returned dtype is the associated floating point dtype for its real and + complex components. 
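+ + :Example: + + .. code-block:: python + + # assuming finfo is re-exported as dpnp.tensor.finfo + import dpnp.tensor as dpt + + # for a complex dtype, the component real dtype is reported + dpt.finfo(dpt.complex64).dtype # dtype('float32')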
+ """ + return self._finfo.dtype + + def __str__(self): + return self._finfo.__str__() + + def __repr__(self): + return self._finfo.__repr__() + + +def can_cast(from_, to, /, *, casting="safe") -> bool: + """can_cast(from, to, casting="safe") + + Determines if one data type can be cast to another data type according \ + to Type Promotion Rules. + + Args: + from_ (Union[usm_ndarray, dtype]): + source data type. If `from_` is an array, a device-specific type + promotion rules apply. + to (dtype): + target data type + casting (Optional[str]): + controls what kind of data casting may occur. + + * "no" means data types should not be cast at all. + * "safe" means only casts that preserve values are allowed. + * "same_kind" means only safe casts and casts within a kind, + like `float64` to `float32`, are allowed. + * "unsafe" means any data conversion can be done. + + Default: `"safe"`. + + Returns: + bool: + Gives `True` if cast can occur according to the casting rule. + + Device-specific type promotion rules take into account which data type are + and are not supported by a specific device. + """ + if isinstance(to, dpt.usm_ndarray): + raise TypeError(f"Expected `dpt.dtype` type, got {type(to)}.") + + dtype_to = dpt.dtype(to) + _supported_dtype([dtype_to]) + + if isinstance(from_, dpt.usm_ndarray): + dtype_from = from_.dtype + return _can_cast( + dtype_from, + dtype_to, + from_.sycl_device.has_aspect_fp16, + from_.sycl_device.has_aspect_fp64, + casting=casting, + ) + else: + dtype_from = dpt.dtype(from_) + _supported_dtype([dtype_from]) + # query casting as if all dtypes are supported + return _can_cast(dtype_from, dtype_to, True, True, casting=casting) + + +def result_type(*arrays_and_dtypes): + """ + result_type(*arrays_and_dtypes) + + Returns the dtype that results from applying the Type Promotion Rules to \ + the arguments. + + Args: + arrays_and_dtypes (Union[usm_ndarray, dtype]): + An arbitrary length sequence of usm_ndarray objects or dtypes. + + Returns: + dtype: + The dtype resulting from an operation involving the + input arrays and dtypes. + """ + dtypes = [] + devices = [] + weak_dtypes = [] + for arg_i in arrays_and_dtypes: + if isinstance(arg_i, dpt.usm_ndarray): + devices.append(arg_i.sycl_device) + dtypes.append(arg_i.dtype) + elif isinstance(arg_i, int): + weak_dtypes.append(WeakIntegralType(arg_i)) + elif isinstance(arg_i, float): + weak_dtypes.append(WeakFloatingType(arg_i)) + elif isinstance(arg_i, complex): + weak_dtypes.append(WeakComplexType(arg_i)) + elif isinstance(arg_i, bool): + weak_dtypes.append(WeakBooleanType(arg_i)) + else: + dt = dpt.dtype(arg_i) + _supported_dtype([dt]) + dtypes.append(dt) + + has_fp16 = True + has_fp64 = True + target_dev = None + if devices: + inspected = False + for d in devices: + if inspected: + unsame_fp16_support = d.has_aspect_fp16 != has_fp16 + unsame_fp64_support = d.has_aspect_fp64 != has_fp64 + if unsame_fp16_support or unsame_fp64_support: + raise ValueError( + "Input arrays reside on devices " + "with different device supports; " + "unable to determine which " + "device-specific type promotion rules " + "to use." 
+ ) + else: + has_fp16 = d.has_aspect_fp16 + has_fp64 = d.has_aspect_fp64 + target_dev = d + inspected = True + + if not dtypes and weak_dtypes: + dtypes.append(weak_dtypes[0].get()) + + if not (has_fp16 and has_fp64): + for dt in dtypes: + if not _dtype_supported_by_device_impl(dt, has_fp16, has_fp64): + raise ValueError( + f"Argument {dt} is not supported by the device" + ) + res_dt = np.result_type(*dtypes) + res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64) + for wdt in weak_dtypes: + pair = _resolve_weak_types(wdt, res_dt, target_dev) + res_dt = np.result_type(*pair) + res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64) + else: + res_dt = np.result_type(*dtypes) + if weak_dtypes: + weak_dt_obj = [wdt.get() for wdt in weak_dtypes] + res_dt = np.result_type(res_dt, *weak_dt_obj) + + return res_dt + + +def iinfo(dtype, /): + """iinfo(dtype) + + Returns machine limits for integer data types. + + Args: + dtype (dtype, usm_ndarray): + integer dtype or + an array with integer dtype. + + Returns: + iinfo_object: + An object with the following attributes: + + * bits: int + number of bits occupied by the data type + * max: int + largest representable number. + * min: int + smallest representable number. + * dtype: dtype + integer data type. + """ + if isinstance(dtype, dpt.usm_ndarray): + dtype = dtype.dtype + _supported_dtype([dpt.dtype(dtype)]) + return np.iinfo(dtype) + + +def finfo(dtype, /): + """finfo(type) + + Returns machine limits for floating-point data types. + + Args: + dtype (dtype, usm_ndarray): floating-point dtype or + an array with floating point data type. + If complex, the information is about its component + data type. + + Returns: + finfo_object: + an object have the following attributes: + + * bits: int + number of bits occupied by dtype. + * eps: float + difference between 1.0 and the next smallest representable + real-valued floating-point number larger than 1.0 according + to the IEEE-754 standard. + * max: float + largest representable real-valued number. + * min: float + smallest representable real-valued number. + * smallest_normal: float + smallest positive real-valued floating-point number with + full precision. + * dtype: dtype + real-valued floating-point data type. + + """ + if isinstance(dtype, dpt.usm_ndarray): + dtype = dtype.dtype + _supported_dtype([dpt.dtype(dtype)]) + return finfo_object(dtype) + + +def _supported_dtype(dtypes): + for dtype in dtypes: + if dtype.char not in "?bBhHiIlLqQefdFD": + raise ValueError(f"Dpctl doesn't support dtype {dtype}.") + return True + + +def isdtype(dtype, kind): + """isdtype(dtype, kind) + + Returns a boolean indicating whether a provided `dtype` is + of a specified data type `kind`. + + See [array API](array_api) for more information. 
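+ + :Example: + + .. code-block:: python + + # assuming isdtype is re-exported as dpnp.tensor.isdtype + import dpnp.tensor as dpt + + dpt.isdtype(dpt.dtype("float32"), "real floating") # True + dpt.isdtype(dpt.dtype("int32"), ("integral", "complex floating")) # True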
+ + [array_api]: https://data-apis.org/array-api/latest/ + """ + + if not isinstance(dtype, np.dtype): + raise TypeError(f"Expected instance of `dpt.dtype`, got {dtype}") + + if isinstance(kind, np.dtype): + return dtype == kind + + elif isinstance(kind, str): + if kind == "bool": + return dtype == np.dtype("bool") + elif kind == "signed integer": + return dtype.kind == "i" + elif kind == "unsigned integer": + return dtype.kind == "u" + elif kind == "integral": + return dtype.kind in "iu" + elif kind == "real floating": + return dtype.kind == "f" + elif kind == "complex floating": + return dtype.kind == "c" + elif kind == "numeric": + return dtype.kind in "iufc" + else: + raise ValueError(f"Unrecognized data type kind: {kind}") + + elif isinstance(kind, tuple): + return any(isdtype(dtype, k) for k in kind) + + else: + raise TypeError(f"Unsupported data type kind: {kind}") + + +def _default_accumulation_dtype(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + """ + inp_kind = inp_dt.kind + if inp_kind in "bi": + res_dt = dpt.dtype(ti.default_device_int_type(q)) + if inp_dt.itemsize > res_dt.itemsize: + res_dt = inp_dt + elif inp_kind in "u": + res_dt = dpt.dtype(ti.default_device_uint_type(q)) + res_ii = dpt.iinfo(res_dt) + inp_ii = dpt.iinfo(inp_dt) + if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max: + pass + else: + res_dt = inp_dt + elif inp_kind in "fc": + res_dt = inp_dt + + return res_dt + + +def _default_accumulation_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + and the accumulation supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = inp_dt + elif inp_kind in "c": + raise ValueError("function not defined for complex types") + + return res_dt + + +__all__ = [ + "_find_buf_dtype", + "_find_buf_dtype2", + "_to_device_supported_dtype", + "_acceptance_fn_default_unary", + "_acceptance_fn_round", + "_acceptance_fn_reciprocal", + "_acceptance_fn_default_binary", + "_acceptance_fn_divide", + "_acceptance_fn_negative", + "_acceptance_fn_subtract", + "_resolve_one_strong_one_weak_types", + "_resolve_one_strong_two_weak_types", + "_resolve_weak_types", + "_resolve_weak_types_all_py_ints", + "_weak_type_num_kind", + "_strong_dtype_num_kind", + "can_cast", + "finfo", + "iinfo", + "isdtype", + "result_type", + "WeakBooleanType", + "WeakIntegralType", + "WeakFloatingType", + "WeakComplexType", + "_default_accumulation_dtype", + "_default_accumulation_dtype_fp_types", + "_find_buf_dtype_in_place_op", +] diff --git a/dpnp/tensor/_types.pxi b/dpnp/tensor/_types.pxi new file mode 100644 index 000000000000..090750658f4b --- /dev/null +++ b/dpnp/tensor/_types.pxi @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# these typenum values are aligned to values in NumPy +cdef: + int UAR_BOOL = 0 # pragma: no cover + int UAR_BYTE = 1 # pragma: no cover + int UAR_UBYTE = 2 # pragma: no cover + int UAR_SHORT = 3 # pragma: no cover + int UAR_USHORT = 4 # pragma: no cover + int UAR_INT = 5 # pragma: no cover + int UAR_UINT = 6 # pragma: no cover + int UAR_LONG = 7 # pragma: no cover + int UAR_ULONG = 8 # pragma: no cover + int UAR_LONGLONG = 9 # pragma: no cover + int UAR_ULONGLONG = 10 # pragma: no cover + int UAR_FLOAT = 11 # pragma: no cover + int UAR_DOUBLE = 12 # pragma: no cover + int UAR_CFLOAT = 14 # pragma: no cover + int UAR_CDOUBLE = 15 # pragma: no cover + int UAR_TYPE_SENTINEL = 17 # pragma: no cover + int UAR_HALF = 23 # pragma: no cover + +cdef int type_bytesize(int typenum): + """ + NPY_BOOL=0 : 1 + NPY_BYTE=1 : 1 + NPY_UBYTE=2 : 1 + NPY_SHORT=3 : 2 + NPY_USHORT=4 : 2 + NPY_INT=5 : sizeof(int) + NPY_UINT=6 : sizeof(unsigned int) + NPY_LONG=7 : sizeof(long) + NPY_ULONG=8 : sizeof(unsigned long) + NPY_LONGLONG=9 : 8 + NPY_ULONGLONG=10 : 8 + NPY_FLOAT=11 : 4 + NPY_DOUBLE=12 : 8 + NPY_LONGDOUBLE=13 : N/A + NPY_CFLOAT=14 : 8 + NPY_CDOUBLE=15 : 16 + NPY_CLONGDOUBLE=16 : N/A + NPY_HALF=23 : 2 + """ + cdef int *type_to_bytesize = [ + 1, + sizeof(char), + sizeof(unsigned char), + sizeof(short), + sizeof(unsigned short), + sizeof(int), + sizeof(unsigned int), + sizeof(long), + sizeof(unsigned long), + sizeof(long long), + sizeof(unsigned long long), + sizeof(float), + sizeof(double), -1, + sizeof(float complex), + sizeof(double complex), -1] + + if typenum < 0: # pragma: no cover + return -1 + if typenum > 16: + if typenum == 23: + return 2 + return -1 + + return type_to_bytesize[typenum] + + +cdef str _make_typestr(int typenum): + """ + Make typestring from type number + """ + cdef type_to_str = ["|b", "|i", "|u", "|i", "|u", + "|i", "|u", "|i", "|u", "|i", "|u", + "|f", "|f", "", "|c", "|c", ""] + + if (typenum < 0): # pragma: no cover + return "" + if (typenum > 16): + if (typenum == 23): + return "|f2" + return "" # pragma: no cover + + return type_to_str[typenum] + str(type_bytesize(typenum)) + + +cdef int typenum_from_format(str s): + """ + Internal utility to convert string describing type format + + Format is [<|=>][biufc]# + 
Shortcuts for formats are i, u, d, D + """ + if not s: + return -1 + try: + dt = np.dtype(s) + except Exception: + return -1 + if (dt.byteorder == ">"): + return -2 + return dt.num + + +cdef int descr_to_typenum(object dtype): + """ + Returns typenum for argumentd dtype that has attribute descr, + assumed numpy.dtype + """ + obj = getattr(dtype, "descr") + if (not isinstance(obj, list) or len(obj) != 1): + return -1 # token for ValueError + obj = obj[0] + if ( + not isinstance(obj, tuple) or len(obj) != 2 or obj[0] + ): # pragma: no cover + return -1 + obj = obj[1] + if not isinstance(obj, str): # pragma: no cover + return -1 + return typenum_from_format(obj) + + +cdef int dtype_to_typenum(dtype): + if isinstance(dtype, str): + return typenum_from_format(dtype) + elif isinstance(dtype, bytes): + return typenum_from_format(dtype.decode("UTF-8")) + elif hasattr(dtype, "descr"): + return descr_to_typenum(dtype) + else: + try: + dt = np.dtype(dtype) + except TypeError: + return -3 + except Exception: # pragma: no cover + return -1 + if hasattr(dt, "descr"): + return descr_to_typenum(dt) + else: # pragma: no cover + return -3 # token for TypeError diff --git a/dpnp/tensor/_usmarray.pxd b/dpnp/tensor/_usmarray.pxd new file mode 100644 index 000000000000..ccb8f4c796b7 --- /dev/null +++ b/dpnp/tensor/_usmarray.pxd @@ -0,0 +1,88 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 + +cimport dpctl + + +cdef public api int USM_ARRAY_C_CONTIGUOUS +cdef public api int USM_ARRAY_F_CONTIGUOUS +cdef public api int USM_ARRAY_WRITABLE + +cdef public api int UAR_BOOL +cdef public api int UAR_BYTE +cdef public api int UAR_UBYTE +cdef public api int UAR_SHORT +cdef public api int UAR_USHORT +cdef public api int UAR_INT +cdef public api int UAR_UINT +cdef public api int UAR_LONG +cdef public api int UAR_ULONG +cdef public api int UAR_LONGLONG +cdef public api int UAR_ULONGLONG +cdef public api int UAR_FLOAT +cdef public api int UAR_DOUBLE +cdef public api int UAR_CFLOAT +cdef public api int UAR_CDOUBLE +cdef public api int UAR_TYPE_SENTINEL +cdef public api int UAR_HALF + + +cdef api class usm_ndarray [object PyUSMArrayObject, type PyUSMArrayType]: + # data fields + cdef char* data_ + cdef int nd_ + cdef Py_ssize_t *shape_ + cdef Py_ssize_t *strides_ + cdef int typenum_ + cdef int flags_ + cdef object base_ + cdef object array_namespace_ + # make usm_ndarray weak-referenceable + cdef object __weakref__ + + cdef void _reset(usm_ndarray self) + cdef void _cleanup(usm_ndarray self) + cdef Py_ssize_t get_offset(usm_ndarray self) except * + + cdef char* get_data(self) + cdef int get_ndim(self) + cdef Py_ssize_t * get_shape(self) + cdef Py_ssize_t * get_strides(self) + cdef int get_typenum(self) + cdef int get_itemsize(self) + cdef int get_flags(self) + cdef object get_base(self) + cdef dpctl.DPCTLSyclQueueRef get_queue_ref(self) except * + cdef dpctl.SyclQueue get_sycl_queue(self) + + cdef _set_writable_flag(self, int) + + cdef __cythonbufferdefaults__ = {"mode": "strided"} diff --git a/dpnp/tensor/_usmarray.pyx b/dpnp/tensor/_usmarray.pyx new file mode 100644 index 000000000000..c696056d53c2 --- /dev/null +++ b/dpnp/tensor/_usmarray.pyx @@ -0,0 +1,1745 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +import dpctl +import dpctl.memory as dpmem +import numpy as np + +from dpctl._backend cimport DPCTLSyclUSMRef +from dpctl._sycl_device_factory cimport _cached_default_device + +import dpnp + +from ._data_types import bool as dpt_bool +from ._device import Device +from ._print import usm_ndarray_repr, usm_ndarray_str + +cimport dpctl as c_dpctl +cimport dpctl.memory as c_dpmem +from cpython.mem cimport PyMem_Free +from cpython.tuple cimport PyTuple_New, PyTuple_SetItem + +from . cimport _dlpack as c_dlpack + +from enum import IntEnum + +from . import _flags +from ._dlpack import get_build_dlpack_version +from ._tensor_impl import default_device_fp_type + +include "_stride_utils.pxi" +include "_types.pxi" +include "_slicing.pxi" + + +class DLDeviceType(IntEnum): + """ + An :class:`enum.IntEnum` for the types of DLDevices supported by the DLPack + protocol. + + ``kDLCPU``: + CPU (host) device + ``kDLCUDA``: + CUDA GPU device + ``kDLCUDAHost``: + Pinned CUDA CPU memory by cudaMallocHost + ``kDLOpenCL``: + OpenCL device + ``kDLVulkan``: + Vulkan buffer + ``kDLMetal``: + Metal for Apple GPU + ``kDLVPI``: + Verilog simulator buffer + ``kDLROCM``: + ROCm GPU device + ``kDLROCMHost``: + Pinned ROCm CPU memory allocated by hipMallocHost + ``kDLExtDev``: + Reserved extension device type used to test new devices + ``kDLCUDAManaged``: + CUDA managed/unified memory allocated by cudaMallocManaged + ``kDLOneAPI``: + Unified shared memory allocated on a oneAPI non-partitioned device + ``kDLWebGPU``: + Device support for WebGPU standard + ``kDLHexagon``: + Qualcomm Hexagon DSP + ``kDLMAIA``: + Microsoft MAIA device + ``kDLTrn``: + AWS Trainium device + """ + kDLCPU = c_dlpack.device_CPU + kDLCUDA = c_dlpack.device_CUDA + kDLCUDAHost = c_dlpack.device_CUDAHost + kDLCUDAManaged = c_dlpack.device_CUDAManaged + kDLROCM = c_dlpack.device_DLROCM + kDLROCMHost = c_dlpack.device_ROCMHost + kDLOpenCL = c_dlpack.device_OpenCL + kDLVulkan = c_dlpack.device_Vulkan + kDLMetal = c_dlpack.device_Metal + kDLVPI = c_dlpack.device_VPI + kDLOneAPI = c_dlpack.device_OneAPI + kDLWebGPU = c_dlpack.device_WebGPU + kDLHexagon = c_dlpack.device_Hexagon + kDLMAIA = c_dlpack.device_MAIA + kDLTrn = c_dlpack.device_Trn + + +cdef class InternalUSMArrayError(Exception): + """ + An InternalUSMArrayError exception is raised when internal + inconsistency has been detected in :class:`.usm_ndarray`. 
+ """ + pass + + +cdef object _as_zero_dim_ndarray(object usm_ary): + "Convert size-1 array to NumPy 0d array" + mem_view = dpmem.as_usm_memory(usm_ary) + usm_ary.sycl_queue.wait() + host_buf = mem_view.copy_to_host() + view = host_buf.view(usm_ary.dtype) + view.shape = tuple() + return view + + +cdef inline void _check_0d_scalar_conversion(object usm_ary) except *: + "Raise TypeError if array cannot be converted to a Python scalar" + if (usm_ary.ndim != 0): + raise TypeError( + "only 0-dimensional arrays can be converted to Python scalars" + ) + + +cdef int _copy_writable(int lhs_flags, int rhs_flags): + "Copy the WRITABLE flag to lhs_flags from rhs_flags" + return (lhs_flags & ~USM_ARRAY_WRITABLE) | (rhs_flags & USM_ARRAY_WRITABLE) + + +cdef bint _is_host_cpu(object dl_device): + "Check if dl_device denotes (kDLCPU, 0)" + cdef object dl_type + cdef object dl_id + cdef Py_ssize_t n_elems = -1 + + try: + n_elems = len(dl_device) + except TypeError: + pass + + if n_elems != 2: + return False + + dl_type = dl_device[0] + dl_id = dl_device[1] + if isinstance(dl_type, str): + return (dl_type == "kDLCPU" and dl_id == 0) + + return (dl_type == DLDeviceType.kDLCPU) and (dl_id == 0) + + +cdef void _validate_and_use_stream( + object stream, c_dpctl.SyclQueue self_queue +) except *: + if (stream is None or stream == self_queue): + pass + else: + if not isinstance(stream, dpctl.SyclQueue): + raise TypeError( + "stream argument type was expected to be dpctl.SyclQueue," + f" got {type(stream)} instead" + ) + ev = self_queue.submit_barrier() + stream.submit_barrier(dependent_events=[ev]) + +cdef class usm_ndarray: + """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \ + offset=0, order="C", buffer_ctor_kwargs=dict(), \ + array_namespace=None) + + An array object represents a multidimensional tensor of numeric + elements stored in a USM allocation on a SYCL device. + + Arg: + shape (int, tuple): + Shape of the array to be created. + dtype (str, dtype): + Array data type, i.e. the type of array elements. + If ``dtype`` has the value ``None``, it is determined by default + floating point type supported by target device. + The supported types are + + ``bool``: + boolean type + ``int8``, ``int16``, ``int32``, ``int64``: + signed integer types + ``uint8``, ``uint16``, ``uint32``, ``uint64``: + unsigned integer types + ``float16``: + half-precision floating type, + supported if target device's property + ``has_aspect_fp16`` is ``True`` + ``float32``, ``complex64``: + single-precision real and complex floating types + ``float64``, ``complex128``: + double-precision real and complex floating + types, supported if target device's property + ``has_aspect_fp64`` is ``True``. + + Default: ``None``. + strides (tuple, optional): + Strides of the array to be created in elements. + If ``strides`` has the value ``None``, it is determined by the + ``shape`` of the array and the requested ``order``. + Default: ``None``. + buffer (str, object, optional): + A string corresponding to the type of USM allocation to make, + or a Python object representing a USM memory allocation, i.e. + :class:`dpctl.memory.MemoryUSMDevice`, + :class:`dpctl.memory.MemoryUSMShared`, or + :class:`dpctl.memory.MemoryUSMHost`. Recognized strings are + ``"device"``, ``"shared"``, or ``"host"``. Additional arguments to + the USM memory allocators can be passed in a dictionary specified + via ``buffer_ctor_kwrds`` keyword parameter. + Default: ``"device"``. 
+ offset (int, optional): + Offset of the array element with all zero indexes relative to the + start of the provided `buffer` in elements. The argument is ignored + if the ``buffer`` value is a string and the memory is allocated by + the constructor. Default: ``0``. + order ({"C", "F"}, optional): + The memory layout of the array when constructing using a new + allocation. Value ``"C"`` corresponds to C-contiguous, or row-major + memory layout, while value ``"F"`` corresponds to F-contiguous, or + column-major layout. Default: ``"C"``. + buffer_ctor_kwargs (dict, optional): + Dictionary with keyword parameters to use when creating a new USM + memory allocation. See :class:`dpctl.memory.MemoryUSMShared` for + supported keyword arguments. + array_namespace (module, optional): + Array namespace module associated with this array. + Default: ``None``. + + ``buffer`` can be ``"shared"``, ``"host"``, ``"device"`` to allocate + new device memory by calling respective constructor with + the specified ``buffer_ctor_kwargs``; ``buffer`` can be an + instance of :class:`dpctl.memory.MemoryUSMShared`, + :class:`dpctl.memory.MemoryUSMDevice`, or + :class:`dpctl.memory.MemoryUSMHost`; ``buffer`` can also be + another :class:`dpctl.tensor.usm_ndarray` instance, in which case its + underlying ``MemoryUSM*`` buffer is used. + """ + + cdef void _reset(usm_ndarray self): + """ + Initializes member fields + """ + self.base_ = None + self.array_namespace_ = None + self.nd_ = -1 + self.data_ = 0 + self.shape_ = 0 + self.strides_ = 0 + self.flags_ = 0 + + cdef void _cleanup(usm_ndarray self): + if (self.shape_): + PyMem_Free(self.shape_) + if (self.strides_): + PyMem_Free(self.strides_) + self._reset() + + def __cinit__(self, shape, dtype=None, strides=None, buffer="device", + Py_ssize_t offset=0, order="C", + buffer_ctor_kwargs=dict(), + array_namespace=None): + """ + strides and offset must be given in units of array elements. + buffer can be strings ('device'|'shared'|'host') to allocate new memory, + or ``dpctl.memory.MemoryUSM*`` buffers, or ``usm_ndarray`` instances. + """ + cdef int nd = 0 + cdef int typenum = 0 + cdef int itemsize = 0 + cdef int err = 0 + cdef int contig_flag = 0 + cdef int writable_flag = USM_ARRAY_WRITABLE + cdef Py_ssize_t *shape_ptr = NULL + cdef Py_ssize_t ary_nelems = 0 + cdef Py_ssize_t ary_nbytes = 0 + cdef Py_ssize_t *strides_ptr = NULL + cdef Py_ssize_t _offset = offset + cdef Py_ssize_t ary_min_displacement = 0 + cdef Py_ssize_t ary_max_displacement = 0 + cdef bint is_fp64 = False + cdef bint is_fp16 = False + + self._reset() + if not isinstance(shape, (list, tuple)): + if hasattr(shape, "tolist"): + fn = getattr(shape, "tolist") + if callable(fn): + shape = shape.tolist() + if not isinstance(shape, (list, tuple)): + try: + <Py_ssize_t> shape + shape = [shape, ] + except Exception as e: + raise TypeError( + "Argument shape must be a non-negative integer, " + "or a list/tuple of such integers." + ) from e + nd = len(shape) + if dtype is None: + if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)): + q = buffer.sycl_queue + else: + q = buffer_ctor_kwargs.get("queue") + if q is not None: + dtype = default_device_fp_type(q) + else: + dev = _cached_default_device() + dtype = "f8" if dev.has_aspect_fp64 else "f4" + typenum = dtype_to_typenum(dtype) + if (typenum < 0): + if typenum == -2: + raise ValueError( + "Data type '" + str(dtype) + + "' can only have native byteorder." + ) + elif typenum == -1: + raise ValueError( + "Data type '" + str(dtype) + "' is not understood."
+ ) + raise TypeError( + f"Expected string or a dtype object, got {type(dtype)}" + ) + itemsize = type_bytesize(typenum) + if (itemsize < 1): + raise TypeError( + "dtype=" + np.dtype(dtype).name + " is not supported." + ) + # allocate host C-arrays for shape, strides + err = _from_input_shape_strides( + nd, shape, strides, itemsize, ord(order), + &shape_ptr, &strides_ptr, &ary_nelems, + &ary_min_displacement, &ary_max_displacement, &contig_flag + ) + if (err): + self._cleanup() + if err == ERROR_MALLOC: + raise MemoryError("Memory allocation for shape/strides " + "array failed.") + elif err == ERROR_INCORRECT_ORDER: + raise ValueError( + "Unsupported order='{}' given. " + "Supported values are 'C' or 'F'.".format(order)) + elif err == ERROR_UNEXPECTED_STRIDES: + raise ValueError( + "strides={} is not understood".format(strides)) + else: + raise InternalUSMArrayError( + " .. while processing shape and strides.") + ary_nbytes = (ary_max_displacement - + ary_min_displacement + 1) * itemsize + if isinstance(buffer, dpmem._memory._Memory): + _buffer = buffer + elif isinstance(buffer, (str, bytes)): + if isinstance(buffer, bytes): + buffer = buffer.decode("UTF-8") + _offset = -ary_min_displacement + if (buffer == "shared"): + _buffer = dpmem.MemoryUSMShared(ary_nbytes, + **buffer_ctor_kwargs) + elif (buffer == "device"): + _buffer = dpmem.MemoryUSMDevice(ary_nbytes, + **buffer_ctor_kwargs) + elif (buffer == "host"): + _buffer = dpmem.MemoryUSMHost(ary_nbytes, + **buffer_ctor_kwargs) + else: + self._cleanup() + raise ValueError( + "buffer='{}' is not understood. " + "Recognized values are 'device', 'shared', 'host', " + "an instance of `MemoryUSM*` object, or a usm_ndarray" + "".format(buffer) + ) + elif isinstance(buffer, usm_ndarray): + if not buffer.flags.writable: + writable_flag = 0 + _buffer = buffer.usm_data + else: + self._cleanup() + raise ValueError("buffer='{}' was not understood.".format(buffer)) + if (shape_to_elem_count(nd, shape_ptr) > 0 and + (_offset + ary_min_displacement < 0 or + (_offset + ary_max_displacement + 1) * itemsize > _buffer.nbytes)): + self._cleanup() + raise ValueError(("buffer='{}' can not accommodate " + "the requested array.").format(buffer)) + is_fp64 = (typenum == UAR_DOUBLE or typenum == UAR_CDOUBLE) + is_fp16 = (typenum == UAR_HALF) + if (is_fp64 or is_fp16): + if ( + (is_fp64 and not _buffer.sycl_device.has_aspect_fp64) or + (is_fp16 and not _buffer.sycl_device.has_aspect_fp16) + ): + raise ValueError( + f"Device {_buffer.sycl_device.name} does" + f" not support {dtype} natively." + ) + self.base_ = _buffer + self.data_ = (<char *> (<size_t> _buffer._pointer)) + itemsize * _offset + self.shape_ = shape_ptr + self.strides_ = strides_ptr + self.typenum_ = typenum + self.flags_ = (contig_flag | writable_flag) + self.nd_ = nd + self.array_namespace_ = array_namespace + + def __dealloc__(self): + self._cleanup() + + @property + def _pointer(self): + """ + Returns USM pointer to the start of array (element with zero + multi-index) encoded as integer.
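+ + :Example: + + .. code-block:: python + + from dpnp import tensor + + x = tensor.ones(10) + # a zero-copy view starts at the same USM address + assert x._pointer == x[:]._pointer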
+ """ + return self.get_data() + + cdef Py_ssize_t get_offset(self) except *: + cdef char *mem_ptr = NULL + cdef char *ary_ptr = self.get_data() + mem_ptr = ( self.base_._pointer) + byte_offset = ary_ptr - mem_ptr + item_size = self.get_itemsize() + if (byte_offset % item_size): + raise InternalUSMArrayError( + "byte_offset is not a multiple of item_size.") + return byte_offset // item_size + + @property + def _element_offset(self): + """Returns the offset of the zero-index element of the array, in + elements, relative to the start of memory allocation""" + return self.get_offset() + + @property + def _byte_bounds(self): + """Returns a 2-tuple with pointers to the end-points of the array + + :Example: + + .. code-block:: python + + from dpnp import tensor + + x = tensor.ones((3, 10, 7)) + y = tensor.flip(x[:, 1::2], axis=1) + + beg_p, end_p = y._byte_bounds + # Bytes taken to store this array + bytes_extent = end_p - beg_p + + # C-contiguous copy is more compact + yc = tensor.copy(y, order="C") + beg_pc, end_pc = yc._byte_bounds + assert bytes_extent < end_pc - beg_pc + """ + cdef Py_ssize_t min_disp = 0 + cdef Py_ssize_t max_disp = 0 + cdef Py_ssize_t step_ = 0 + cdef Py_ssize_t dim_ = 0 + cdef int it = 0 + cdef Py_ssize_t _itemsize = self.get_itemsize() + + if ( + (self.flags_ & USM_ARRAY_C_CONTIGUOUS) + or (self.flags_ & USM_ARRAY_F_CONTIGUOUS) + ): + return ( + self._pointer, + self._pointer + shape_to_elem_count( + self.nd_, self.shape_ + ) * _itemsize + ) + + for it in range(self.nd_): + dim_ = self.shape[it] + if dim_ > 0: + step_ = self.strides[it] + if step_ > 0: + max_disp += step_ * (dim_ - 1) + else: + min_disp += step_ * (dim_ - 1) + + return ( + self._pointer + min_disp * _itemsize, + self._pointer + (max_disp + 1) * _itemsize + ) + + cdef char* get_data(self): + """Returns the USM pointer for this array.""" + return self.data_ + + cdef int get_ndim(self): + """ + Returns the number of indices needed to address + an element of this array. + """ + return self.nd_ + + cdef Py_ssize_t* get_shape(self): + """ + Returns pointer to shape C-array for this array. + + C-array has at least ``ndim`` non-negative elements, + which determine the range of permissible indices + addressing individual elements of this array. + """ + return self.shape_ + + cdef Py_ssize_t* get_strides(self): + """ + Returns pointer to strides C-array for this array. 
+ + The pointer can be NULL (for a contiguous array); otherwise the + C-array has at least ``ndim`` elements + """ + return self.strides_ + + cdef int get_typenum(self): + """Returns typenum corresponding to values of this array""" + return self.typenum_ + + cdef int get_itemsize(self): + """ + Returns itemsize of this array in bytes + """ + return type_bytesize(self.typenum_) + + cdef int get_flags(self): + """Returns flags of this array""" + return self.flags_ + + cdef object get_base(self): + """Returns the object owning the USM data addressed by this array""" + return self.base_ + + cdef c_dpctl.SyclQueue get_sycl_queue(self): + cdef c_dpmem._Memory mem + if not isinstance(self.base_, dpctl.memory._Memory): + raise InternalUSMArrayError( + "This array has unexpected memory owner" + ) + mem = <c_dpmem._Memory> self.base_ + return mem.queue + + cdef c_dpctl.DPCTLSyclQueueRef get_queue_ref(self) except *: + """ + Returns a copy of DPCTLSyclQueueRef associated with array + """ + cdef c_dpctl.SyclQueue q = self.get_sycl_queue() + cdef c_dpctl.DPCTLSyclQueueRef QRef = q.get_queue_ref() + cdef c_dpctl.DPCTLSyclQueueRef QRefCopy = NULL + if QRef is not NULL: + QRefCopy = c_dpctl.DPCTLQueue_Copy(QRef) + return QRefCopy + else: + raise InternalUSMArrayError( + "Memory owner of this array is corrupted" + ) + + @property + def __sycl_usm_array_interface__(self): + """ + Gives ``__sycl_usm_array_interface__`` dictionary describing + the array. + """ + cdef Py_ssize_t byte_offset = -1 + cdef int item_size = -1 + cdef Py_ssize_t elem_offset = -1 + cdef char *mem_ptr = NULL + cdef char *ary_ptr = NULL + if (not isinstance(self.base_, dpmem._memory._Memory)): + raise InternalUSMArrayError( + "Invalid instance of usm_ndarray encountered. " + "Private field base_ has an unexpected type {}.".format( + type(self.base_) + ) + ) + ary_iface = self.base_.__sycl_usm_array_interface__ + mem_ptr = <char *>(<size_t> ary_iface["data"][0]) + ary_ptr = <char *> self.data_ + ro_flag = False if (self.flags_ & USM_ARRAY_WRITABLE) else True + ary_iface["data"] = (<size_t> mem_ptr, ro_flag) + ary_iface["shape"] = self.shape + if (self.strides_): + ary_iface["strides"] = _make_int_tuple(self.nd_, self.strides_) + else: + if (self.flags_ & USM_ARRAY_C_CONTIGUOUS): + ary_iface["strides"] = None + elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS): + ary_iface["strides"] = _f_contig_strides(self.nd_, self.shape_) + else: + raise InternalUSMArrayError( + "USM Array is not contiguous and has empty strides" + ) + ary_iface["typestr"] = _make_typestr(self.typenum_) + byte_offset = ary_ptr - mem_ptr + item_size = self.get_itemsize() + if (byte_offset % item_size): + raise InternalUSMArrayError( + "byte_offset is not a multiple of item_size.") + elem_offset = byte_offset // item_size + ary_iface["offset"] = elem_offset + # must wait for content of the memory to finalize + self.sycl_queue.wait() + return ary_iface + + @property + def ndim(self): + """ + Gives the number of indices needed to address elements of this array. + """ + return self.nd_ + + @property + def usm_data(self): + """ + Gives USM memory object underlying :class:`.usm_ndarray` instance. + """ + return self.get_base() + + @property + def shape(self): + """ + Elements of the shape tuple give the lengths of the + respective array dimensions. + + Setting shape is allowed only when reshaping to the requested + dimensions can be returned as a view, otherwise :exc:`AttributeError` + is raised. Use :func:`dpctl.tensor.reshape` to reshape the array + in all cases. + + :Example: + + ..
code-block:: python + + from dpnp import tensor + + x = tensor.arange(899) + x.shape = (29, 31) + """ + if self.nd_ > 0: + return _make_int_tuple(self.nd_, self.shape_) + else: + return tuple() + + @shape.setter + def shape(self, new_shape): + """ + Modifies usm_ndarray instance in-place by changing its metadata + about the shape and the strides of the array, or raises + `AttributeError` exception if in-place change is not possible. + + Args: + new_shape (tuple, int): + New shape. Only non-negative values are supported. + The new shape must not change the number of elements + in the array. + + Whether the array can be reshaped in-place depends on its + strides. Use :func:`dpctl.tensor.reshape`, which always + succeeds by performing a copy if necessary. + """ + cdef int new_nd = -1 + cdef Py_ssize_t nelems = -1 + cdef int err = 0 + cdef Py_ssize_t min_disp = 0 + cdef Py_ssize_t max_disp = 0 + cdef int contig_flag = 0 + cdef Py_ssize_t *shape_ptr = NULL + cdef Py_ssize_t *strides_ptr = NULL + cdef Py_ssize_t size = -1 + import operator + + from ._reshape import reshaped_strides + + try: + new_nd = len(new_shape) + except TypeError: + new_nd = 1 + new_shape = (new_shape,) + try: + new_shape = tuple(operator.index(dim) for dim in new_shape) + except TypeError: + raise TypeError( + "Target shape must be a finite iterable of integers" + ) + size = shape_to_elem_count(self.nd_, self.shape_) + if not np.prod(new_shape) == size: + raise TypeError( + f"Can not reshape array of size {self.size} into {new_shape}" + ) + if size > 0: + new_strides = reshaped_strides( + self.shape, + self.strides, + new_shape + ) + else: + new_strides = (1,) * len(new_shape) + if new_strides is None: + raise AttributeError( + "Incompatible shape for in-place modification. " + "Use `reshape()` to make a copy with the desired shape." + ) + err = _from_input_shape_strides( + new_nd, new_shape, new_strides, + self.get_itemsize(), + b"C", + &shape_ptr, &strides_ptr, + &nelems, &min_disp, &max_disp, &contig_flag + ) + if (err == 0): + if (self.shape_): + PyMem_Free(self.shape_) + if (self.strides_): + PyMem_Free(self.strides_) + self.flags_ = (contig_flag | (self.flags_ & USM_ARRAY_WRITABLE)) + self.nd_ = new_nd + self.shape_ = shape_ptr + self.strides_ = strides_ptr + else: + raise InternalUSMArrayError( + "Encountered in shape setter, error code {err}".format(err=err) + ) + + @property + def strides(self): + """ + Returns memory displacement in array elements, upon unit + change of respective index. + + For example, for strides ``(s1, s2, s3)`` and multi-index + ``(i1, i2, i3)`` position of the respective element relative + to zero multi-index element is ``s1*i1 + s2*i2 + s3*i3``. + + :Example: + + .. code-block:: python + + from dpnp import tensor + + x = tensor.zeros((20, 30)) + xv = x[10:, :15] + + multi_id = (3, 5) + byte_displacement = xv[multi_id]._pointer - xv[0, 0]._pointer + element_displacement = sum( + i * s for i, s in zip(multi_id, xv.strides) + ) + assert byte_displacement == element_displacement * xv.itemsize + """ + if (self.strides_): + return _make_int_tuple(self.nd_, self.strides_) + else: + if (self.flags_ & USM_ARRAY_C_CONTIGUOUS): + return _c_contig_strides(self.nd_, self.shape_) + elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS): + return _f_contig_strides(self.nd_, self.shape_) + else: + raise ValueError("Inconsistent usm_ndarray data") + + @property + def flags(self): + """ + Returns :class:`dpctl.tensor._flags.Flags` object.
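+ + :Example: + + .. code-block:: python + + from dpnp import tensor + + # assuming dpctl-compatible Flags attributes + x = tensor.ones((3, 4)) + x.flags.c_contiguous # True + x[:, ::2].flags.c_contiguous # False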
+ """ + return _flags.Flags(self, self.flags_) + + cdef _set_writable_flag(self, int flag): + cdef int mask = (USM_ARRAY_WRITABLE if flag else 0) + self.flags_ = _copy_writable(self.flags_, mask) + + @property + def usm_type(self): + """ + USM type of underlying memory. Possible values are: + + * ``"device"`` + USM-device allocation in device memory, only accessible + to kernels executed on the device + * ``"shared"`` + USM-shared allocation in device memory, accessible both + from the device and from host + * ``"host"`` + USM-host allocation in host memory, accessible both + from the device and from host + + See: https://docs.oneapi.com/versions/latest/dpcpp/iface/usm.html + """ + return self.base_.get_usm_type() + + @property + def itemsize(self): + """ + Size of array element in bytes. + """ + return self.get_itemsize() + + @property + def nbytes(self): + """ + Total bytes consumed by the elements of the array. + """ + return ( + shape_to_elem_count(self.nd_, self.shape_) * + self.get_itemsize()) + + @property + def size(self): + """ + Number of elements in the array. + """ + return shape_to_elem_count(self.nd_, self.shape_) + + @property + def dtype(self): + """ + Returns NumPy's dtype corresponding to the type of the array elements. + """ + return np.dtype(_make_typestr(self.typenum_)) + + @property + def sycl_queue(self): + """ + Returns :class:`dpctl.SyclQueue` object associated with USM data. + """ + return self.get_sycl_queue() + + @property + def sycl_device(self): + """ + Returns :class:`dpctl.SyclDevice` object on which USM data + was allocated. + """ + q = self.sycl_queue + return q.sycl_device + + @property + def device(self): + """ + Returns :class:`dpctl.tensor.Device` object representing + residence of the array data. + + The ``Device`` object represents Array API notion of the + device, and contains :class:`dpctl.SyclQueue` associated + with this array. Hence, ``.device`` property provides + information distinct from ``.sycl_device`` property. + + :Example: + + .. code-block:: python + + >>> from dpnp import tensor + >>> x = tensor.ones(10) + >>> x.device + Device(level_zero:gpu:0) + """ + return Device.create_device(self.sycl_queue) + + @property + def sycl_context(self): + """ + Returns :class:`dpctl.SyclContext` object to which USM data is bound. + """ + q = self.sycl_queue + return q.sycl_context + + @property + def T(self): + """Returns transposed array for 2D array, raises ``ValueError`` + otherwise. + """ + if self.nd_ == 2: + return _transpose(self) + else: + raise ValueError( + "array.T requires array to have 2 dimensions. " + "Use array.mT to transpose stacks of matrices and " + "dpnp.tensor.permute_dims() to permute dimensions." + ) + + @property + def mT(self): + """ Returns array (a view) where the last two dimensions are + transposed. + """ + if self.nd_ < 2: + raise ValueError( + "array.mT requires array to have at least 2 dimensions." + ) + return _m_transpose(self) + + @property + def real(self): + """ + Returns view into real component for arrays with + complex data-types and returns itself for all other + data-types. + + :Example: + + .. 
code-block:: python
+
+            from dpnp import tensor
+
+            # Create complex array from
+            # arrays of real and imaginary parts
+
+            re = tensor.linspace(-1, 1, num=100, dtype="f4")
+            im = tensor.full_like(re, fill_value=tensor.pi)
+
+            z = tensor.empty_like(re, dtype="c8")
+            z.real[:] = re
+            z.imag[:] = im
+        """
+        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
+        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
+            # elements are real
+            return self
+        if (self.typenum_ < UAR_TYPE_SENTINEL):
+            return _real_view(self)
+
+    @property
+    def imag(self):
+        """ Returns view into imaginary component for arrays with
+        complex data-types and returns new zero array for all other
+        data-types.
+
+        :Example:
+
+        .. code-block:: python
+
+            from dpnp import tensor
+
+            # Reset imaginary part of complex array
+
+            z = tensor.ones(100, dtype="c8")
+            z.imag[:] = tensor.pi/2
+        """
+        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
+        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
+            # elements are real
+            return _zero_like(self)
+        if (self.typenum_ < UAR_TYPE_SENTINEL):
+            return _imag_view(self)
+
+    def __getitem__(self, ind):
+        cdef tuple _meta = _basic_slice_meta(
+            ind, (<object>self).shape, (<object>self).strides,
+            self.get_offset())
+        cdef usm_ndarray res
+        cdef int i = 0
+        cdef bint matching = 1
+
+        if len(_meta) < 5:
+            raise RuntimeError
+
+        res = usm_ndarray.__new__(
+            usm_ndarray,
+            _meta[0],
+            dtype=_make_typestr(self.typenum_),
+            strides=_meta[1],
+            buffer=self.base_,
+            offset=_meta[2]
+        )
+        res.array_namespace_ = self.array_namespace_
+
+        adv_ind = _meta[3]
+        adv_ind_start_p = _meta[4]
+
+        if adv_ind_start_p < 0:
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index
+
+        # if len(adv_ind) == 1, the (only) element is always an array
+        if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
+            key_ = adv_ind[0]
+            adv_ind_end_p = key_.ndim + adv_ind_start_p
+            if adv_ind_end_p > res.ndim:
+                raise IndexError("too many indices for the array")
+            key_shape = key_.shape
+            arr_shape = res.shape[adv_ind_start_p:adv_ind_end_p]
+            for i in range(key_.ndim):
+                if matching:
+                    if not key_shape[i] == arr_shape[i] and key_shape[i] > 0:
+                        matching = 0
+            if not matching:
+                raise IndexError(
+                    "boolean index did not match indexed array in dimensions"
+                )
+            res = _extract_impl(res, key_, axis=adv_ind_start_p)
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        if any(
+            (
+                isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
+            ) for ind in adv_ind
+        ):
+            adv_ind_int = list()
+            for ind in adv_ind:
+                if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
+                    adv_ind_int.extend(_nonzero_impl(ind))
+                else:
+                    adv_ind_int.append(ind)
+            res = _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p)
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        res = _take_multi_index(res, adv_ind, adv_ind_start_p)
+        res.flags_ = _copy_writable(res.flags_, self.flags_)
+        return res
+
+    def to_device(self, target_device, /, *, stream=None):
+        """ to_device(target_device, /, *, stream=None)
+
+        Transfers this array to the specified target device.
+
+        :Example:
+            .. code-block:: python
+
+                import dpctl
+                import dpnp.tensor as dpt
+
+                x = dpt.full(10**6, 2, dtype="int64")
+                q_prof = dpctl.SyclQueue(
+                    x.sycl_device, property="enable_profiling")
+                # return a view with profile-enabled queue
+                y = x.to_device(q_prof)
+                timer = dpctl.SyclTimer()
+                with timer(q_prof):
+                    z = y * y
+                print(timer.dt)
+
+        Args:
+            target_device (object):
+                Array API concept of target device.
+                It can be a oneAPI filter selector string,
+                an instance of :class:`dpctl.SyclDevice` corresponding to a
+                non-partitioned SYCL device, an instance of
+                :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            stream (:class:`dpctl.SyclQueue`, optional):
+                Execution queue to synchronize with. If ``None``,
+                synchronization is not performed.
+
+        Returns:
+            usm_ndarray:
+                A view if data copy is not required, and a copy otherwise.
+                If copying is required, it is done by copying from the
+                original allocation device to the host, followed by copying
+                from host to the target device.
+        """
+        cdef c_dpctl.DPCTLSyclQueueRef QRef = NULL
+        cdef c_dpmem._Memory arr_buf
+        d = Device.create_device(target_device)
+
+        _validate_and_use_stream(stream, self.sycl_queue)
+
+        if (d.sycl_context == self.sycl_context):
+            arr_buf = self.usm_data
+            QRef = (<c_dpctl.SyclQueue> d.sycl_queue).get_queue_ref()
+            view_buffer = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+                arr_buf.get_data_ptr(),
+                arr_buf.nbytes,
+                QRef,
+                memory_owner=arr_buf
+            )
+            res = usm_ndarray(
+                self.shape,
+                self.dtype,
+                buffer=view_buffer,
+                strides=self.strides,
+                offset=self.get_offset()
+            )
+            res.flags_ = self.flags_
+            return res
+        else:
+            nbytes = self.usm_data.nbytes
+            copy_buffer = type(self.usm_data)(
+                nbytes, queue=d.sycl_queue
+            )
+            copy_buffer.copy_from_device(self.usm_data)
+            res = usm_ndarray(
+                self.shape,
+                self.dtype,
+                buffer=copy_buffer,
+                strides=self.strides,
+                offset=self.get_offset()
+            )
+            res.flags_ = self.flags_
+            return res
+
+    def _set_namespace(self, mod):
+        """ Sets array namespace to given module `mod`. """
+        self.array_namespace_ = mod
+
+    def __array_namespace__(self, api_version=None):
+        """
+        Returns array namespace, member functions of which
+        implement data API.
+
+        Args:
+            api_version (str, optional):
+                Request namespace compliant with given version of
+                array API. If ``None``, namespace for the most
+                recent supported version is returned.
+                Default: ``None``.
+        """
+        if api_version is not None:
+            from ._array_api import __array_api_version__
+            if not isinstance(api_version, str):
+                raise TypeError(f"Expected type str, got {type(api_version)}")
+            if api_version != __array_api_version__:
+                raise ValueError(f"Only {__array_api_version__} is supported")
+        return (
+            self.array_namespace_
+            if self.array_namespace_ is not None
+            else dpnp.tensor
+        )
+
+    def __bool__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__bool__()
+
+        if self.size == 0:
+            raise ValueError(
+                "The truth value of an empty array is ambiguous"
+            )
+
+        raise ValueError(
+            "The truth value of an array with more than one element is "
+            "ambiguous. Use dpnp.tensor.any() or dpnp.tensor.all()"
+        )
+
+    def __float__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__float__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __complex__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__complex__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __int__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__int__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __index__(self):
+        if np.issubdtype(self.dtype, np.integer):
+            return int(self)
+
+        raise IndexError("only integer arrays are valid indices")
+
+    def __abs__(self):
+        return dpnp.tensor.abs(self)
+
+    def __add__(self, other):
+        """
+        Implementation for operator.add
+        """
+        return dpnp.tensor.add(self, other)
+
+    def __and__(self, other):
+        "Implementation for operator.and_"
+        return dpnp.tensor.bitwise_and(self, other)
+
+    def __dlpack__(
+        self, *, stream=None, max_version=None, dl_device=None, copy=None
+    ):
+        """
+        Produces DLPack capsule.
+
+        Args:
+            stream (:class:`dpctl.SyclQueue`, optional):
+                Execution queue to synchronize with.
+                If ``None``, synchronization is not performed.
+                Default: ``None``.
+            max_version (tuple[int, int], optional):
+                The maximum DLPack version the consumer (caller of
+                ``__dlpack__``) supports. As ``__dlpack__`` may not
+                always return a DLPack capsule with version
+                ``max_version``, the consumer must verify the version
+                even if this argument is passed.
+                Default: ``None``.
+            dl_device (tuple[enum.Enum, int], optional):
+                The device the returned DLPack capsule will be
+                placed on.
+                The device must be a 2-tuple matching the format of
+                ``__dlpack_device__`` method, an integer enumerator
+                representing the device type followed by an integer
+                representing the index of the device.
+                Default: ``None``.
+            copy (bool, optional):
+                Boolean indicating whether or not to copy the input.
+
+                * If ``copy`` is ``True``, the input will always be
+                  copied.
+                * If ``False``, a ``BufferError`` will be raised if a
+                  copy is deemed necessary.
+                * If ``None``, a copy will be made only if deemed
+                  necessary, otherwise, the existing memory buffer will
+                  be reused.
+
+                Default: ``None``.
+
+        Raises:
+            MemoryError:
+                when host memory can not be allocated.
+            DLPackCreationError:
+                when array is allocated on a partitioned
+                SYCL device, or with a non-default context.
+            BufferError:
+                when a copy is deemed necessary but ``copy``
+                is ``False`` or when the provided ``dl_device``
+                cannot be handled.
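+
+        :Example:
+
+        .. code-block:: python
+
+            from dpnp import tensor
+
+            x = tensor.arange(100)
+            # a minimal sketch: request a capsule in the versioned
+            # DLPack format; consumers normally obtain capsules via
+            # `from_dlpack` rather than by calling this directly
+            capsule = x.__dlpack__(max_version=(1, 0))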
+ """ + if max_version is None: + # legacy path for DLManagedTensor + # copy kwarg ignored because copy flag can't be set + _caps = c_dlpack.to_dlpack_capsule(self) + _validate_and_use_stream(stream, self.sycl_queue) + return _caps + else: + if not isinstance(max_version, tuple) or len(max_version) != 2: + raise TypeError( + "`__dlpack__` expects `max_version` to be a " + "2-tuple of integers `(major, minor)`, instead " + f"got {max_version}" + ) + dpctl_dlpack_version = get_build_dlpack_version() + if max_version[0] >= dpctl_dlpack_version[0]: + # DLManagedTensorVersioned path + if dl_device is not None: + if not isinstance(dl_device, tuple) or len(dl_device) != 2: + raise TypeError( + "`__dlpack__` expects `dl_device` to be a 2-tuple " + "of `(device_type, device_id)`, instead " + f"got {dl_device}" + ) + if dl_device != self.__dlpack_device__(): + if copy is False: + raise BufferError( + "array cannot be placed on the requested " + "device without a copy" + ) + if _is_host_cpu(dl_device): + if stream is not None: + raise ValueError( + "`stream` must be `None` when `dl_device` " + "is of type `kDLCPU`" + ) + from ._copy_utils import _copy_to_numpy + _arr = _copy_to_numpy(self) + _arr.flags["W"] = self.flags["W"] + return c_dlpack.numpy_to_dlpack_versioned_capsule( + _arr, True + ) + else: + raise BufferError( + f"targeting `dl_device` {dl_device} with " + "`__dlpack__` is not yet implemented" + ) + if copy is None: + copy = False + # TODO: strategy for handling stream on different device + # from dl_device + if copy: + _validate_and_use_stream(stream, self.sycl_queue) + nbytes = self.usm_data.nbytes + copy_buffer = type(self.usm_data)( + nbytes, queue=self.sycl_queue + ) + copy_buffer.copy_from_device(self.usm_data) + _copied_arr = usm_ndarray( + self.shape, + self.dtype, + buffer=copy_buffer, + strides=self.strides, + offset=self.get_offset() + ) + _copied_arr.flags_ = self.flags_ + _caps = c_dlpack.to_dlpack_versioned_capsule( + _copied_arr, copy + ) + else: + _caps = c_dlpack.to_dlpack_versioned_capsule(self, copy) + _validate_and_use_stream(stream, self.sycl_queue) + return _caps + else: + # legacy path for DLManagedTensor + _caps = c_dlpack.to_dlpack_capsule(self) + _validate_and_use_stream(stream, self.sycl_queue) + return _caps + + def __dlpack_device__(self): + """ + Gives a tuple (``device_type``, ``device_id``) corresponding to + ``DLDevice`` entry in ``DLTensor`` in DLPack protocol. + + The tuple describes the non-partitioned device where the array has been + allocated, or the non-partitioned parent device of the allocation + device. + + See :class:`dpctl.tensor.DLDeviceType` for a list of devices supported + by the DLPack protocol. + + Raises: + DLPackCreationError: + when the ``device_id`` could not be determined. + """ + try: + dev_id = self.sycl_device.get_device_id() + except ValueError as e: + raise c_dlpack.DLPackCreationError( + "Could not determine id of the device where array was " + "allocated." 
+            )
+        return (
+            DLDeviceType.kDLOneAPI,
+            dev_id,
+        )
+
+    def __eq__(self, other):
+        return dpnp.tensor.equal(self, other)
+
+    def __floordiv__(self, other):
+        return dpnp.tensor.floor_divide(self, other)
+
+    def __ge__(self, other):
+        return dpnp.tensor.greater_equal(self, other)
+
+    def __gt__(self, other):
+        return dpnp.tensor.greater(self, other)
+
+    def __invert__(self):
+        return dpnp.tensor.bitwise_invert(self)
+
+    def __le__(self, other):
+        return dpnp.tensor.less_equal(self, other)
+
+    def __len__(self):
+        if (self.nd_):
+            return self.shape[0]
+        else:
+            raise TypeError("len() of unsized object")
+
+    def __lshift__(self, other):
+        return dpnp.tensor.bitwise_left_shift(self, other)
+
+    def __lt__(self, other):
+        return dpnp.tensor.less(self, other)
+
+    def __matmul__(self, other):
+        return dpnp.tensor.matmul(self, other)
+
+    def __mod__(self, other):
+        return dpnp.tensor.remainder(self, other)
+
+    def __mul__(self, other):
+        return dpnp.tensor.multiply(self, other)
+
+    def __ne__(self, other):
+        return dpnp.tensor.not_equal(self, other)
+
+    def __neg__(self):
+        return dpnp.tensor.negative(self)
+
+    def __or__(self, other):
+        return dpnp.tensor.bitwise_or(self, other)
+
+    def __pos__(self):
+        return dpnp.tensor.positive(self)
+
+    def __pow__(self, other):
+        return dpnp.tensor.pow(self, other)
+
+    def __rshift__(self, other):
+        return dpnp.tensor.bitwise_right_shift(self, other)
+
+    def __setitem__(self, key, rhs):
+        cdef tuple _meta
+        cdef usm_ndarray Xv
+
+        if (self.flags_ & USM_ARRAY_WRITABLE) == 0:
+            raise ValueError("Can not modify read-only array.")
+
+        _meta = _basic_slice_meta(
+            key, (<object>self).shape, (<object>self).strides,
+            self.get_offset()
+        )
+
+        if len(_meta) < 5:
+            raise RuntimeError
+
+        Xv = usm_ndarray.__new__(
+            usm_ndarray,
+            _meta[0],
+            dtype=_make_typestr(self.typenum_),
+            strides=_meta[1],
+            buffer=self.base_,
+            offset=_meta[2],
+        )
+        # set namespace
+        Xv.array_namespace_ = self.array_namespace_
+
+        from ._copy_utils import (
+            _copy_from_numpy_into,
+            _copy_from_usm_ndarray_to_usm_ndarray,
+            _nonzero_impl,
+            _place_impl,
+            _put_multi_index,
+        )
+
+        adv_ind = _meta[3]
+        adv_ind_start_p = _meta[4]
+
+        if adv_ind_start_p < 0:
+            # basic slicing
+            if isinstance(rhs, usm_ndarray):
+                _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs)
+            else:
+                if hasattr(rhs, "__sycl_usm_array_interface__"):
+                    from dpnp.tensor import asarray
+                    try:
+                        rhs_ar = asarray(rhs)
+                        _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs_ar)
+                    except Exception:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} could not be "
+                            "converted to usm_ndarray"
+                        )
+                else:
+                    rhs_np = np.asarray(rhs)
+                    if type_bytesize(rhs_np.dtype.num) < 0:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} can not be "
+                            "assigned to usm_ndarray because of "
+                            f"unsupported data type '{rhs_np.dtype}'"
+                        )
+                    try:
+                        _copy_from_numpy_into(Xv, rhs_np)
+                    except Exception:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} could not be "
+                            "copied into dpnp.tensor.usm_ndarray"
+                        )
+            return
+
+        if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
+            _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p)
+            return
+
+        if any(
+            (
+                isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
+            ) for ind in adv_ind
+        ):
+            adv_ind_int = list()
+            for ind in adv_ind:
+                if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
+                    adv_ind_int.extend(_nonzero_impl(ind))
+                else:
+                    adv_ind_int.append(ind)
+            _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs)
+            return
+
+        _put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs)
+        return
+
+    def __sub__(self, 
other): + return dpnp.tensor.subtract(self, other) + + def __truediv__(self, other): + return dpnp.tensor.divide(self, other) + + def __xor__(self, other): + return dpnp.tensor.bitwise_xor(self, other) + + def __radd__(self, other): + return dpnp.tensor.add(other, self) + + def __rand__(self, other): + return dpnp.tensor.bitwise_and(other, self) + + def __rfloordiv__(self, other): + return dpnp.tensor.floor_divide(other, self) + + def __rlshift__(self, other): + return dpnp.tensor.bitwise_left_shift(other, self) + + def __rmatmul__(self, other): + return dpnp.tensor.matmul(other, self) + + def __rmod__(self, other): + return dpnp.tensor.remainder(other, self) + + def __rmul__(self, other): + return dpnp.tensor.multiply(other, self) + + def __ror__(self, other): + return dpnp.tensor.bitwise_or(other, self) + + def __rpow__(self, other): + return dpnp.tensor.pow(other, self) + + def __rrshift__(self, other): + return dpnp.tensor.bitwise_right_shift(other, self) + + def __rsub__(self, other): + return dpnp.tensor.subtract(other, self) + + def __rtruediv__(self, other): + return dpnp.tensor.divide(other, self) + + def __rxor__(self, other): + return dpnp.tensor.bitwise_xor(other, self) + + def __iadd__(self, other): + return dpnp.tensor.add._inplace_op(self, other) + + def __iand__(self, other): + return dpnp.tensor.bitwise_and._inplace_op(self, other) + + def __ifloordiv__(self, other): + return dpnp.tensor.floor_divide._inplace_op(self, other) + + def __ilshift__(self, other): + return dpnp.tensor.bitwise_left_shift._inplace_op(self, other) + + def __imatmul__(self, other): + return dpnp.tensor.matmul(self, other, out=self, dtype=self.dtype) + + def __imod__(self, other): + return dpnp.tensor.remainder._inplace_op(self, other) + + def __imul__(self, other): + return dpnp.tensor.multiply._inplace_op(self, other) + + def __ior__(self, other): + return dpnp.tensor.bitwise_or._inplace_op(self, other) + + def __ipow__(self, other): + return dpnp.tensor.pow._inplace_op(self, other) + + def __irshift__(self, other): + return dpnp.tensor.bitwise_right_shift._inplace_op(self, other) + + def __isub__(self, other): + return dpnp.tensor.subtract._inplace_op(self, other) + + def __itruediv__(self, other): + return dpnp.tensor.divide._inplace_op(self, other) + + def __ixor__(self, other): + return dpnp.tensor.bitwise_xor._inplace_op(self, other) + + def __str__(self): + return usm_ndarray_str(self) + + def __repr__(self): + return usm_ndarray_repr(self) + + def __array__(self, dtype=None, /, *, copy=None): + """NumPy's array protocol method to disallow implicit conversion. + + Without this definition, `numpy.asarray(usm_ar)` converts + usm_ndarray instance into NumPy array with data type `object` + and every element being 0d usm_ndarray. + + https://github.com/IntelPython/dpctl/pull/1384#issuecomment-1707212972 + """ + raise TypeError( + "Implicit conversion to a NumPy array is not allowed. 
" + "Use `dpnp.tensor.asnumpy` to copy data from this " + "`dpnp.tensor.usm_ndarray` instance to NumPy array" + ) + + +cdef usm_ndarray _real_view(usm_ndarray ary): + """ + View into real parts of a complex type array + """ + cdef int r_typenum_ = -1 + cdef usm_ndarray r = None + cdef Py_ssize_t offset_elems = 0 + + if (ary.typenum_ == UAR_CFLOAT): + r_typenum_ = UAR_FLOAT + elif (ary.typenum_ == UAR_CDOUBLE): + r_typenum_ = UAR_DOUBLE + else: + raise InternalUSMArrayError( + "_real_view call on array of non-complex type.") + + offset_elems = ary.get_offset() * 2 + r = usm_ndarray.__new__( + usm_ndarray, + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), + dtype=_make_typestr(r_typenum_), + strides=tuple(2 * si for si in ary.strides), + buffer=ary.base_, + offset=offset_elems, + order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F") + ) + r.flags_ = _copy_writable(r.flags_, ary.flags_) + r.array_namespace_ = ary.array_namespace_ + return r + + +cdef usm_ndarray _imag_view(usm_ndarray ary): + """ + View into imaginary parts of a complex type array + """ + cdef int r_typenum_ = -1 + cdef usm_ndarray r = None + cdef Py_ssize_t offset_elems = 0 + + if (ary.typenum_ == UAR_CFLOAT): + r_typenum_ = UAR_FLOAT + elif (ary.typenum_ == UAR_CDOUBLE): + r_typenum_ = UAR_DOUBLE + else: + raise InternalUSMArrayError( + "_imag_view call on array of non-complex type.") + + # displace pointer to imaginary part + offset_elems = 2 * ary.get_offset() + 1 + r = usm_ndarray.__new__( + usm_ndarray, + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), + dtype=_make_typestr(r_typenum_), + strides=tuple(2 * si for si in ary.strides), + buffer=ary.base_, + offset=offset_elems, + order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F") + ) + r.flags_ = _copy_writable(r.flags_, ary.flags_) + r.array_namespace_ = ary.array_namespace_ + return r + + +cdef usm_ndarray _transpose(usm_ndarray ary): + """ + Construct transposed array without copying the data + """ + cdef usm_ndarray r = usm_ndarray.__new__( + usm_ndarray, + _make_reversed_int_tuple(ary.nd_, ary.shape_), + dtype=_make_typestr(ary.typenum_), + strides=( + _make_reversed_int_tuple(ary.nd_, ary.strides_) + if (ary.strides_) else None), + buffer=ary.base_, + order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"), + offset=ary.get_offset() + ) + r.flags_ = _copy_writable(r.flags_, ary.flags_) + return r + + +cdef usm_ndarray _m_transpose(usm_ndarray ary): + """ + Construct matrix transposed array + """ + cdef usm_ndarray r = usm_ndarray.__new__( + usm_ndarray, + _swap_last_two(_make_int_tuple(ary.nd_, ary.shape_)), + dtype=_make_typestr(ary.typenum_), + strides=_swap_last_two(ary.strides), + buffer=ary.base_, + order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"), + offset=ary.get_offset() + ) + r.flags_ = _copy_writable(r.flags_, ary.flags_) + return r + + +cdef usm_ndarray _zero_like(usm_ndarray ary): + """ + Make C-contiguous array of zero elements with same shape, + type, device, and sycl_queue as ary. 
+ """ + cdef dt = _make_typestr(ary.typenum_) + cdef usm_ndarray r = usm_ndarray( + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), + dtype=dt, + buffer=ary.base_.get_usm_type(), + buffer_ctor_kwargs={"queue": ary.get_sycl_queue()}, + ) + r.base_.memset() + return r + + +def _is_object_with_buffer_protocol(o): + "Returns True if object supports Python buffer protocol" + return _is_buffer(o) diff --git a/dpnp/tensor/_utility_functions.py b/dpnp/tensor/_utility_functions.py new file mode 100644 index 000000000000..651ce0830266 --- /dev/null +++ b/dpnp/tensor/_utility_functions.py @@ -0,0 +1,506 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import builtins +import operator + +import dpctl.utils as du + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti +import dpnp.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_index, normalize_axis_tuple +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) + + +def _boolean_reduction(x, axis, keepdims, func): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + red_nd = nd + # case of a scalar + if red_nd == 0: + return dpt.astype(x, dpt.bool) + x_tmp = x + res_shape = () + perm = list(range(nd)) + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + + red_nd = len(axis) + # check for axis=() + if red_nd == 0: + return dpt.astype(x, dpt.bool) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + + exec_q = x.sycl_queue + res_usm_type = x.usm_type + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + # always allocate the temporary as + # int32 and usm-device to ensure that atomic updates + # are supported + res_tmp = dpt.empty( + res_shape, + dtype=dpt.int32, + usm_type="device", + sycl_queue=exec_q, + ) + hev0, ev0 = func( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res_tmp, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev0, ev0) + + # copy to boolean result array + res = dpt.empty( + res_shape, + dtype=dpt.bool, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev1, ev1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=res_tmp, dst=res, sycl_queue=exec_q, depends=[ev0] + ) + _manager.add_event_pair(hev1, ev1) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + return res + + +def all(x, /, *, axis=None, keepdims=False): + """ + all(x, axis=None, keepdims=False) + + Tests whether all input array elements evaluate to True along a given axis. + + Args: + x (usm_ndarray): Input array. + axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes) + along which to perform a logical AND reduction. + When `axis` is `None`, a logical AND reduction + is performed over all dimensions of `x`. + If `axis` is negative, the axis is counted from + the last dimension to the first. + Default: `None`. + keepdims (bool, optional): If `True`, the reduced axes are included + in the result as singleton dimensions, and the result is + broadcastable to the input array shape. + If `False`, the reduced axes are not included in the result. + Default: `False`. + + Returns: + usm_ndarray: + An array with a data type of `bool` + containing the results of the logical AND reduction. + """ + return _boolean_reduction(x, axis, keepdims, tri._all) + + +def any(x, /, *, axis=None, keepdims=False): + """ + any(x, axis=None, keepdims=False) + + Tests whether any input array elements evaluate to True along a given axis. + + Args: + x (usm_ndarray): Input array. + axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes) + along which to perform a logical OR reduction. 
+ When `axis` is `None`, a logical OR reduction + is performed over all dimensions of `x`. + If `axis` is negative, the axis is counted from + the last dimension to the first. + Default: `None`. + keepdims (bool, optional): If `True`, the reduced axes are included + in the result as singleton dimensions, and the result is + broadcastable to the input array shape. + If `False`, the reduced axes are not included in the result. + Default: `False`. + + Returns: + usm_ndarray: + An array with a data type of `bool` + containing the results of the logical OR reduction. + """ + return _boolean_reduction(x, axis, keepdims, tri._any) + + +def _validate_diff_shape(sh1, sh2, axis): + """ + Utility for validating that two shapes `sh1` and `sh2` + are possible to concatenate along `axis`. + """ + if not sh2: + # scalars will always be accepted + return True + else: + sh1_ndim = len(sh1) + if sh1_ndim == len(sh2) and builtins.all( + sh1[i] == sh2[i] for i in range(sh1_ndim) if i != axis + ): + return True + else: + return False + + +def _concat_diff_input(arr, axis, prepend, append): + """ + Concatenates `arr`, `prepend` and, `append` along `axis`, + where `arr` is an array and `prepend` and `append` are + any mixture of arrays and scalars. + """ + if prepend is not None and append is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, prepend_usm_type = _get_queue_usm_type(prepend) + q3, append_usm_type = _get_queue_usm_type(append) + if q2 is None and q3 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + elif q3 is None: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + ) + ) + elif q2 is None: + exec_q = dpt.get_execution_queue((q1, q3)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + append_usm_type, + ) + ) + else: + exec_q = dpt.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + append_usm_type, + ) + ) + dpt.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + prepend_shape = _get_shape(prepend) + append_shape = _get_shape(append) + if not builtins.all( + isinstance(s, (tuple, list)) + for s in ( + prepend_shape, + append_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + valid_prepend_shape = _validate_diff_shape( + arr_shape, prepend_shape, axis + ) + if not valid_prepend_shape: + raise ValueError( + f"`diff` argument `prepend` with shape {prepend_shape} is " + f"invalid for first input with shape {arr_shape}" + ) + valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis) + if not valid_append_shape: + raise ValueError( + f"`diff` argument `append` with shape {append_shape} is invalid" + f" for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + prepend_dtype = _get_dtype(prepend, sycl_dev) + append_dtype = _get_dtype(append, sycl_dev) + if not builtins.all( + _validate_dtype(o) for o in (prepend_dtype, append_dtype) + ): + raise ValueError("Operands have unsupported data types") + prepend_dtype, append_dtype = _resolve_one_strong_two_weak_types( + arr_dtype, prepend_dtype, append_dtype, sycl_dev + ) + if isinstance(prepend, dpt.usm_ndarray): + a_prepend = prepend + else: + a_prepend = dpt.asarray( + prepend, + dtype=prepend_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if isinstance(append, dpt.usm_ndarray): + a_append = append + else: + a_append = dpt.asarray( + append, + dtype=append_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not prepend_shape: + prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_prepend = dpt.broadcast_to(a_prepend, prepend_shape) + if not append_shape: + append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_append = dpt.broadcast_to(a_append, append_shape) + return dpt.concat((a_prepend, arr, a_append), axis=axis) + elif prepend is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, prepend_usm_type = _get_queue_usm_type(prepend) + if q2 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + ) + ) + dpt.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + prepend_shape = _get_shape(prepend) + if not isinstance(prepend_shape, (tuple, list)): + raise TypeError( + "Shape of argument can not be inferred. 
" + "Argument is expected to be a " + "list or tuple" + ) + valid_prepend_shape = _validate_diff_shape( + arr_shape, prepend_shape, axis + ) + if not valid_prepend_shape: + raise ValueError( + f"`diff` argument `prepend` with shape {prepend_shape} is " + f"invalid for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + prepend_dtype = _get_dtype(prepend, sycl_dev) + if not _validate_dtype(prepend_dtype): + raise ValueError("Operand has unsupported data type") + prepend_dtype = _resolve_one_strong_one_weak_types( + arr_dtype, prepend_dtype, sycl_dev + ) + if isinstance(prepend, dpt.usm_ndarray): + a_prepend = prepend + else: + a_prepend = dpt.asarray( + prepend, + dtype=prepend_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not prepend_shape: + prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_prepend = dpt.broadcast_to(a_prepend, prepend_shape) + return dpt.concat((a_prepend, arr), axis=axis) + elif append is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, append_usm_type = _get_queue_usm_type(append) + if q2 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + append_usm_type, + ) + ) + dpt.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + append_shape = _get_shape(append) + if not isinstance(append_shape, (tuple, list)): + raise TypeError( + "Shape of argument can not be inferred. " + "Argument is expected to be a " + "list or tuple" + ) + valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis) + if not valid_append_shape: + raise ValueError( + f"`diff` argument `append` with shape {append_shape} is invalid" + f" for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + append_dtype = _get_dtype(append, sycl_dev) + if not _validate_dtype(append_dtype): + raise ValueError("Operand has unsupported data type") + append_dtype = _resolve_one_strong_one_weak_types( + arr_dtype, append_dtype, sycl_dev + ) + if isinstance(append, dpt.usm_ndarray): + a_append = append + else: + a_append = dpt.asarray( + append, + dtype=append_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not append_shape: + append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_append = dpt.broadcast_to(a_append, append_shape) + return dpt.concat((arr, a_append), axis=axis) + else: + arr1 = arr + return arr1 + + +def diff(x, /, *, axis=-1, n=1, prepend=None, append=None): + """ + Calculates the `n`-th discrete forward difference of `x` along `axis`. + + Args: + x (usm_ndarray): + input array. + axis (int): + axis along which to compute the difference. A valid axis must be on + the interval `[-N, N)`, where `N` is the rank (number of + dimensions) of `x`. + Default: `-1` + n (int): + number of times to recursively compute the difference. + Default: `1`. + prepend (Union[usm_ndarray, bool, int, float, complex]): + value or values to prepend to the specified axis before taking the + difference. + Must have the same shape as `x` except along `axis`, which can have + any shape. + Default: `None`. + append (Union[usm_ndarray, bool, int, float, complex]): + value or values to append to the specified axis before taking the + difference. 
+ Must have the same shape as `x` except along `axis`, which can have + any shape. + Default: `None`. + + Returns: + usm_ndarray: + an array containing the `n`-th differences. The array will have the + same shape as `x`, except along `axis`, which will have shape: + ``prepend.shape[axis] + x.shape[axis] + append.shape[axis] - n`` + + The data type of the returned array is determined by the Type + Promotion Rules. + """ + + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(x)}" + ) + x_nd = x.ndim + axis = normalize_axis_index(operator.index(axis), x_nd) + n = operator.index(n) + if n < 0: + raise ValueError(f"`n` must be positive, got {n}") + arr = _concat_diff_input(x, axis, prepend, append) + if n == 0: + return arr + # form slices and recurse + sl0 = tuple( + slice(None) if i != axis else slice(1, None) for i in range(x_nd) + ) + sl1 = tuple( + slice(None) if i != axis else slice(None, -1) for i in range(x_nd) + ) + + diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract + if n > 1: + arr_tmp0 = diff_op(arr[sl0], arr[sl1]) + arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1]) + n = n - 2 + if n > 0: + sl3 = tuple( + slice(None) if i != axis else slice(None, -2) + for i in range(x_nd) + ) + for _ in range(n): + arr_tmp0_sliced = arr_tmp0[sl3] + diff_op(arr_tmp1[sl0], arr_tmp1[sl1], out=arr_tmp0_sliced) + arr_tmp0, arr_tmp1 = arr_tmp1, arr_tmp0_sliced + arr = arr_tmp1 + else: + arr = diff_op(arr[sl0], arr[sl1]) + return arr diff --git a/dpnp/tensor/include/dlpack/LICENSE.third-party b/dpnp/tensor/include/dlpack/LICENSE.third-party new file mode 100644 index 000000000000..20a9c8a7b4dc --- /dev/null +++ b/dpnp/tensor/include/dlpack/LICENSE.third-party @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2017 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dpnp/tensor/include/dlpack/README.md b/dpnp/tensor/include/dlpack/README.md new file mode 100644 index 000000000000..315ad1b9a566 --- /dev/null +++ b/dpnp/tensor/include/dlpack/README.md @@ -0,0 +1,7 @@ +# DLPack header + +The header `dlpack.h` downloaded from `https://github.com/dmlc/dlpack.git` remote at tag v1.3 commit [`84d107b`](https://github.com/dmlc/dlpack/commit/84d107bf416c6bab9ae68ad285876600d230490d). + +The file can also be viewed using github web interface at https://github.com/dmlc/dlpack/blob/v1.3/include/dlpack/dlpack.h + +License file was retrieved from https://github.com/dmlc/dlpack/blob/main/LICENSE diff --git a/dpnp/tensor/include/dlpack/dlpack.h b/dpnp/tensor/include/dlpack/dlpack.h new file mode 100644 index 000000000000..5196acc87711 --- /dev/null +++ b/dpnp/tensor/include/dlpack/dlpack.h @@ -0,0 +1,683 @@ +/*! + * Copyright (c) 2017 - by Contributors + * \file dlpack.h + * \brief The common header of DLPack. 
+ */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! \brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 3 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ + typedef struct + { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; + } DLPackVersion; + +/*! + * \brief The device type in DLDevice. + */ +#ifdef __cplusplus + typedef enum : int32_t + { +#else +typedef enum +{ +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, + /*! \brief Microsoft MAIA devices */ + kDLMAIA = 17, + /*! \brief AWS Trainium */ + kDLTrn = 18, + } DLDeviceType; + + /*! + * \brief A Device for Tensor and operator. + */ + typedef struct + { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set + * to 0. + */ + int32_t device_id; + } DLDevice; + + /*! + * \brief The type code options DLDataType. + */ + typedef enum + { + /*! 
\brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to + * be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, + /*! \brief FP8 data types */ + kDLFloat8_e3m4 = 7U, + kDLFloat8_e4m3 = 8U, + kDLFloat8_e4m3b11fnuz = 9U, + kDLFloat8_e4m3fn = 10U, + kDLFloat8_e4m3fnuz = 11U, + kDLFloat8_e5m2 = 12U, + kDLFloat8_e5m2fnuz = 13U, + kDLFloat8_e8m0fnu = 14U, + /*! \brief FP6 data types + * Setting bits != 6 is currently unspecified, and the producer must + * ensure it is set while the consumer must stop importing if the value + * is unexpected. + */ + kDLFloat6_e2m3fn = 15U, + kDLFloat6_e3m2fn = 16U, + /*! \brief FP4 data types + * Setting bits != 4 is currently unspecified, and the producer must + * ensure it is set while the consumer must stop importing if the value + * is unexpected. + */ + kDLFloat4_e2m1fn = 17U, + } DLDataTypeCode; + + /*! + * \brief The data type the tensor can hold. The data type is assumed to + * follow the native endian-ness. An explicit error message should be raised + * when attempting to export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library + * convention, the underlying storage size of bool is 8 bits) + * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) + * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) + * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) + * + * When a sub-byte type is packed, DLPack requires the data to be in little + * bit-endian, i.e., for a packed data set D ((D >> (i * bits)) && bit_mask) + * stores the i-th element. + */ + typedef struct + { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; + } DLDataType; + + /*! + * \brief Plain C Tensor object, does not manage memory. + */ + typedef struct + { + /*! + * \brief The data pointer points to the allocated data. This will be + * CUDA device pointer or cl_mem handle in OpenCL. It may be opaque on + * some device types. This pointer is always aligned to 256 bytes as in + * CUDA. The `byte_offset` field should be used to point to the + * beginning of the data. + * + * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, + * TensorFlow, TVM, perhaps others) do not adhere to this 256 byte + * alignment requirement on CPU/CUDA/ROCm, and always use + * `byte_offset=0`. This must be fixed (after which this note will be + * updated); at the moment it is recommended to not rely on the data + * pointer being correctly aligned. 
+ * + * For given DLTensor, the size of memory required to store the contents + * of data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + * + * Note that if the tensor is of size zero, then the data pointer should + * be set to `NULL`. + */ + void *data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! + * \brief The shape of the tensor + * + * When ndim == 0, shape can be set to NULL. + */ + int64_t *shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes), + * can not be NULL if ndim != 0, must points to + * an array of ndim elements that specifies the strides, + * so consumer can always rely on strides[dim] being valid for 0 <= dim + * < ndim. + * + * When ndim == 0, strides can be set to NULL. + * + * \note Before DLPack v1.2, strides can be NULL to indicate contiguous + * data. This is not allowed in DLPack v1.2 and later. The rationale is + * to simplify the consumer handling. + */ + int64_t *strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; + } DLTensor; + + /*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. + * It is not meant to transfer the tensor. When the borrowing framework + * doesn't need the tensor, it should call the deleter to notify the host + * that the resource is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned + */ + typedef struct DLManagedTensor + { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor + * in which DLManagedTensor is used in the framework. It can also be + * NULL. + */ + void *manager_ctx; + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. It can + * be NULL if there is no way for the caller to provide a reasonable + * destructor. The destructor deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor *self); + } DLManagedTensor; + +// bit masks used in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief bit mask to indicate that the tensor is a copy made by the producer. + * + * If set, the tensor is considered solely owned throughout its lifetime by the + * consumer, until the producer-provided deleter is invoked. + */ +#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) + +/*! + * \brief bit mask to indicate that whether a sub-byte type is packed or padded. + * + * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can + * be set by the producer to signal that a tensor of sub-byte type is padded. + */ +#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL) + + /*! 
+ * \brief A versioned and managed C Tensor object that manages the memory
+ * of a DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor
+ * by another framework. It is not meant to transfer the tensor. When the
+ * borrowing framework doesn't need the tensor, it should call the deleter
+ * to notify the host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+typedef struct DLManagedTensorVersioned
+{
+    /*!
+     * \brief The API and ABI version of the current managed Tensor.
+     */
+    DLPackVersion version;
+    /*!
+     * \brief the context of the original host framework.
+     *
+     * Stores the context in which the DLManagedTensorVersioned is used in
+     * the framework. It can also be NULL.
+     */
+    void *manager_ctx;
+    /*!
+     * \brief Destructor.
+     *
+     * This should be called to destruct manager_ctx which holds the
+     * DLManagedTensorVersioned. It can be NULL if there is no way for the
+     * caller to provide a reasonable destructor. The destructor deletes
+     * the argument self as well.
+     */
+    void (*deleter)(struct DLManagedTensorVersioned *self);
+    /*!
+     * \brief Additional bitmask flags information about the tensor.
+     *
+     * By default the flags should be set to 0.
+     *
+     * \note Future ABI changes should keep everything up to and including
+     * this field stable, to ensure that the deleter can be correctly
+     * called.
+     *
+     * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+     * \sa DLPACK_FLAG_BITMASK_IS_COPIED
+     */
+    uint64_t flags;
+    /*! \brief DLTensor which is being memory managed */
+    DLTensor dl_tensor;
+} DLManagedTensorVersioned;
+
+//----------------------------------------------------------------------
+// DLPack `__dlpack_c_exchange_api__` fast exchange protocol definitions
+//----------------------------------------------------------------------
+/*!
+ * \brief Request a producer library to create a new tensor.
+ *
+ * Create a new `DLManagedTensorVersioned` within the context of the
+ * producer library. The allocation is defined via the prototype DLTensor.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
+ *     and device fields are used.
+ * \param out The output DLManagedTensorVersioned; on success `*out` owns
+ *     the newly created tensor.
+ * \param error_ctx Context for `SetError`.
+ * \param SetError The function to set the error.
+ * \return 0 on success, -1 on failure. SetError is called exactly when -1
+ *     is returned (the implementer must ensure this).
+ * \note - As a C function, it must not throw C++ exceptions.
+ *       - Errors are propagated via SetError to avoid any direct need of
+ *         the Python API. Because of this, `SetError` may have to ensure
+ *         the GIL is held, since it will presumably set a Python error.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackManagedTensorAllocator)( //
+    DLTensor *prototype,
+    DLManagedTensorVersioned **out,
+    void *error_ctx, //
+    void (*SetError)(void *error_ctx,
+                     const char *kind,
+                     const char *message) //
+);
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
+ *
+ * This function does not perform any stream synchronization. The consumer
+ * should query DLPackCurrentWorkStream to get the current work stream and
+ * launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ *     as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLManagedTensorVersioned, owned by the caller.
+ * \return 0 on success, -1 on failure with a Python exception set. If the
+ *     data cannot be described using DLPack, the exception should be a
+ *     BufferError if possible.
+ * \note As a C function, it must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
+    void *py_object,                                  //
+    DLManagedTensorVersioned **out                    //
+);
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
+ *
+ * This function provides a faster interface for temporary, non-owning,
+ * exchange. The producer (implementer) still owns the memory of data,
+ * strides, shape. The liveness of the DLTensor and the data it views is
+ * only guaranteed until control is returned.
+ *
+ * This function currently assumes that the producer (implementer) can fill
+ * in the DLTensor shape and strides without the need for temporary
+ * allocations.
+ *
+ * This function does not perform any stream synchronization. The consumer
+ * should query DLPackCurrentWorkStream to get the current work stream and
+ * launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ *     as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLTensor, whose space is pre-allocated on stack.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note As a C function, it must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
+    void *py_object,                             //
+    DLTensor *out                                //
+);
+
+/*!
+ * \brief Obtain the current work stream of a device.
+ *
+ * Obtain the current work stream of a device from the producer framework.
+ * For example, it should map to torch.cuda.current_stream in PyTorch.
+ *
+ * When device_type is kDLCPU, the consumer does not have to query the
+ * stream, and the producer can simply return NULL when queried. The
+ * consumer does not have to do anything about stream synchronization or
+ * setting, so a CPU-only framework can just provide a dummy implementation
+ * that always sets out_current_stream[0] to NULL.
+ *
+ * \param device_type The device type.
+ * \param device_id The device id.
+ * \param out_current_stream The output current work stream.
+ *
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note As a C function, it must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackCurrentWorkStream)( //
+    DLDeviceType device_type,           //
+    int32_t device_id,                  //
+    void **out_current_stream           //
+);
+
+/*!
+ * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
+ *
+ * Convert an owning DLManagedTensorVersioned* to the Python tensor of the
+ * producer (implementer) library with the correct type.
+ *
+ * This function does not perform any stream synchronization.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param tensor The DLManagedTensorVersioned to convert; the ownership of
+ *     the tensor is stolen.
+ * \param out_py_object The output Python object.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
+    DLManagedTensorVersioned *tensor,               //
+    void **out_py_object                            //
+);
+
+/*!
+ * \brief DLPackExchangeAPI stable header.
+ * \sa DLPackExchangeAPI
+ */
+typedef struct DLPackExchangeAPIHeader
+{
+    /*!
+     * \brief The provided DLPack version. The consumer must check major
+     * version compatibility before using this struct.
+     */
+    DLPackVersion version;
+    /*!
+     * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
+     *
+     * It must be NULL if the framework does not support older versions.
+     * If the current major version is larger than the one supported by the
+     * consumer, the consumer may walk this chain to find an earlier
+     * supported version.
+     *
+     * \sa DLPackExchangeAPI
+     */
+    struct DLPackExchangeAPIHeader *prev_api;
+} DLPackExchangeAPIHeader;
+
+/*!
+ * \brief Framework-specific function pointer table for DLPack exchange.
+ *
+ * In addition to `__dlpack__()`, we define a C function table shareable by
+ * Python implementations via `__dlpack_c_exchange_api__`. This attribute
+ * must be set on the type as a Python PyCapsule with name
+ * "dlpack_exchange_api".
+ *
+ * A consumer library may use a pattern such as:
+ *
+ * \code
+ *
+ * PyObject *api_capsule = PyObject_GetAttrString(
+ *     (PyObject *)Py_TYPE(tensor_obj), "__dlpack_c_exchange_api__");
+ * if (api_capsule == NULL) { goto handle_error; }
+ * MyDLPackExchangeAPI *api = (MyDLPackExchangeAPI *)PyCapsule_GetPointer(
+ *     api_capsule, "dlpack_exchange_api"
+ * );
+ * Py_DECREF(api_capsule);
+ * if (api == NULL) { goto handle_error; }
+ *
+ * \endcode
+ *
+ * Note that this must be defined on the type. The consumer should look up
+ * the attribute on the type and may cache the result for each unique type.
+ *
+ * The precise API table is given by:
+ * \code
+ * struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
+ *     MyDLPackExchangeAPI() {
+ *         header.version.major = DLPACK_MAJOR_VERSION;
+ *         header.version.minor = DLPACK_MINOR_VERSION;
+ *         header.prev_api = nullptr;
+ *
+ *         managed_tensor_allocator = MyDLPackManagedTensorAllocator;
+ *         managed_tensor_from_py_object_no_sync =
+ *             MyDLPackManagedTensorFromPyObjectNoSync;
+ *         managed_tensor_to_py_object_no_sync =
+ *             MyDLPackManagedTensorToPyObjectNoSync;
+ *         dltensor_from_py_object_no_sync =
+ *             MyDLPackDLTensorFromPyObjectNoSync;
+ *         current_work_stream = MyDLPackCurrentWorkStream;
+ *     }
+ *
+ *     static const DLPackExchangeAPI *Global() {
+ *         static MyDLPackExchangeAPI inst;
+ *         return &inst;
+ *     }
+ * };
+ * \endcode
+ *
+ * Guidelines for leveraging DLPackExchangeAPI:
+ *
+ * There are generally two kinds of consumer needs for DLPack exchange:
+ * - N0: library support, where consumer.kernel(x, y, z) would like to run
+ *   a kernel with the data from x, y, z. The consumer is also expected to
+ *   run the kernel within the same stream context as the producer. For
+ *   example, when x, y, z are torch.Tensor, the consumer should query
+ *   exchange_api->current_work_stream to get the current stream and launch
+ *   the kernel on that stream. This setup avoids any synchronization at
+ *   kernel launch and gives maximum compatibility with CUDA graph capture
+ *   in the producer. This is the desirable behavior for library extension
+ *   support for frameworks like PyTorch.
+ * - N1: data ingestion and retention.
+ *
+ * Note that the obj.__dlpack__() API should already provide useful ways to
+ * address N1. The primary focus of the current DLPackExchangeAPI is to
+ * enable the faster exchange N0, with the support of the function pointer
+ * current_work_stream.
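+ *
+ * As an illustrative sketch of the N0 flow (the consumer-side names
+ * `x_obj` and `stream` here are hypothetical), a consumer holding the API
+ * table may do:
+ *
+ * \code
+ * DLTensor x_view;
+ * void *stream = NULL;
+ * if (api->dltensor_from_py_object_no_sync != NULL &&
+ *     api->dltensor_from_py_object_no_sync(x_obj, &x_view) == 0 &&
+ *     api->current_work_stream(x_view.device.device_type,
+ *                              x_view.device.device_id, &stream) == 0) {
+ *     // launch the consumer kernel on `stream`; no synchronization is
+ *     // needed because the producer's current stream is reused
+ * }
+ * \endcode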
+ *
+ * Array/Tensor libraries should statically create and initialize this
+ * structure, and expose a pointer to it (as the
+ * `__dlpack_c_exchange_api__` PyCapsule described above) on the
+ * Tensor/Array type. The DLPackExchangeAPI* must stay alive throughout
+ * the lifetime of the process.
+ *
+ * One simple way to do so is to create a static instance of
+ * DLPackExchangeAPI within the framework and return a pointer to it; the
+ * MyDLPackExchangeAPI example above shows how to do so in C++. It should
+ * also be reasonably easy to do so in other languages.
+ */
+typedef struct DLPackExchangeAPI
+{
+    /*!
+     * \brief The header that remains stable across versions.
+     */
+    DLPackExchangeAPIHeader header;
+    /*!
+     * \brief Producer function pointer for DLPackManagedTensorAllocator.
+     * This function must not be NULL.
+     * \sa DLPackManagedTensorAllocator
+     */
+    DLPackManagedTensorAllocator managed_tensor_allocator;
+    /*!
+     * \brief Producer function pointer for
+     * DLPackManagedTensorFromPyObjectNoSync. This function must not be
+     * NULL.
+     * \sa DLPackManagedTensorFromPyObjectNoSync
+     */
+    DLPackManagedTensorFromPyObjectNoSync
+        managed_tensor_from_py_object_no_sync;
+    /*!
+     * \brief Producer function pointer for
+     * DLPackManagedTensorToPyObjectNoSync. This function must not be NULL.
+     * \sa DLPackManagedTensorToPyObjectNoSync
+     */
+    DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
+    /*!
+     * \brief Producer function pointer for
+     * DLPackDLTensorFromPyObjectNoSync. This function can be NULL when the
+     * producer does not support it.
+     * \sa DLPackDLTensorFromPyObjectNoSync
+     */
+    DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
+    /*!
+     * \brief Producer function pointer for DLPackCurrentWorkStream.
+     * This function must not be NULL.
+     * \sa DLPackCurrentWorkStream
+     */
+    DLPackCurrentWorkStream current_work_stream;
+} DLPackExchangeAPI;
+
+#ifdef __cplusplus
+} // DLPACK_EXTERN_C
+#endif
+#endif // DLPACK_DLPACK_H_
diff --git a/dpnp/tensor/libtensor/include/kernels/accumulators.hpp b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
new file mode 100644
index 000000000000..9449c030ac67
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
@@ -0,0 +1,1427 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for accumulators (cumulative sum, prod, etc.).
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::accumulators
+{
+
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+template <typename T>
+T ceiling_quotient(T n, T m)
+{
+    return (n + m - 1) / m;
+}
+
+template <typename inputT, typename outputT>
+struct NonZeroIndicator
+{
+    constexpr NonZeroIndicator() {}
+
+    outputT operator()(const inputT &val) const
+    {
+        static constexpr outputT out_one(1);
+        static constexpr outputT out_zero(0);
+        static constexpr inputT val_zero(0);
+
+        return (val == val_zero) ? out_zero : out_one;
+    }
+};
+
+template <typename T>
+struct NoOpTransformer
+{
+    constexpr NoOpTransformer() {}
+
+    T operator()(const T &val) const { return val; }
+};
+
+template <typename srcTy, typename dstTy>
+struct CastTransformer
+{
+    constexpr CastTransformer() {}
+
+    dstTy operator()(const srcTy &val) const
+    {
+        using dpctl::tensor::type_utils::convert_impl;
+        return convert_impl<dstTy, srcTy>(val);
+    }
+};
+
+template <typename T, typename ScanOpT>
+struct needs_workaround
+{
+    // workaround needed due to crash in JITing on CPU
+    // remove when CMPLRLLVM-65813 is resolved
+    static constexpr bool value = su_ns::IsSyclLogicalAnd<T, ScanOpT>::value ||
+                                  su_ns::IsSyclLogicalOr<T, ScanOpT>::value;
+};
+
+template <typename T, typename ScanOpT>
+struct can_use_inclusive_scan_over_group
+{
+    static constexpr bool value = sycl::has_known_identity<ScanOpT, T>::value &&
+                                  !needs_workaround<T, ScanOpT>::value;
+};
+
+namespace detail
+{
+template <typename T>
+class stack_t
+{
+    T *src_;
+    std::size_t size_;
+    T *local_scans_;
+
+public:
+    stack_t() : src_{}, size_{}, local_scans_{} {}
+    stack_t(T *src, std::size_t sz, T *local_scans)
+        : src_(src), size_(sz), local_scans_(local_scans)
+    {
+    }
+    ~stack_t() {}
+
+    T *get_src_ptr() const { return src_; }
+
+    std::size_t get_size() const { return size_; }
+
+    T *get_local_scans_ptr() const { return local_scans_; }
+};
+
+template <typename T>
+class stack_strided_t
+{
+    T *src_;
+    std::size_t size_;
+    T *local_scans_;
+    std::size_t local_stride_;
+
+public:
+    stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {}
+    stack_strided_t(T *src,
+                    std::size_t sz,
+                    T *local_scans,
+                    std::size_t local_stride)
+        : src_(src), size_(sz), local_scans_(local_scans),
+          local_stride_(local_stride)
+    {
+    }
+    ~stack_strided_t() {}
+
+    T *get_src_ptr() const { return src_; }
+
+    std::size_t get_size() const { return size_; }
+
+    T *get_local_scans_ptr() const { return local_scans_; }
+
+    std::size_t get_local_stride() const {
return local_stride_; } +}; + +} // end of namespace detail + +// Iterative cumulative summation + +using nwiT = std::uint32_t; + +template +class inclusive_scan_iter_local_scan_blocked_krn; + +template +class inclusive_scan_iter_local_scan_striped_krn; + +template +sycl::event inclusive_scan_base_step_blocked( + sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + acc_groups = ceiling_quotient(acc_nelems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto gws = sycl::range<1>(iter_nelems * acc_groups * wg_size); + auto lws = sycl::range<1>(wg_size); + + auto ndRange = sycl::nd_range<1>(gws, lws); + + slmT slm_iscan_tmp(lws, cgh); + + using KernelName = inclusive_scan_iter_local_scan_blocked_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_id(0); + const std::size_t lid = it.get_local_id(0); + + const std::uint32_t wg_size = it.get_local_range(0); + const std::size_t reduce_chunks = acc_groups * wg_size; + const std::size_t iter_gid = gid / reduce_chunks; + const std::size_t chunk_gid = gid - (iter_gid * reduce_chunks); + + const std::size_t i = chunk_gid * n_wi; + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); + + std::array local_iscan; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::size_t i_m_wi = i + m_wi; + if constexpr (!include_initial) { + local_iscan[m_wi] = + (i_m_wi < acc_nelems) + ? transformer(input[inp_iter_offset + + inp_indexer(s0 + s1 * i_m_wi)]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (i_m_wi < acc_nelems && i_m_wi > 0) + ? transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * i_m_wi) - 1)]) + : identity; + } + } + +#pragma unroll + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); + } + // local_iscan is now result of + // inclusive scan of locally stored inputs + + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), it.get_sub_group(), slm_iscan_tmp, + local_iscan.back(), identity, scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + const outputT modifier = (lid == 0) ? 
identity : slm_iscan_tmp[lid]; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier); + } + + const std::size_t start = std::min(i, acc_nelems); + const std::size_t end = std::min(i + n_wi, acc_nelems); + const nwiT m_max = static_cast(end - start); + for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) { + output[out_iter_offset + out_indexer(i + m_wi)] = + local_iscan[m_wi]; + } + }); + }); + + return inc_scan_phase1_ev; +} + +template +sycl::event inclusive_scan_base_step_striped( + sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + const std::uint32_t reduce_nelems_per_wg = n_wi * wg_size; + acc_groups = + ceiling_quotient(acc_nelems, reduce_nelems_per_wg); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + const auto &gRange = sycl::range<1>{iter_nelems * acc_groups * wg_size}; + const auto &lRange = sycl::range<1>{wg_size}; + + const auto &ndRange = sycl::nd_range<1>{gRange, lRange}; + + slmT slm_iscan_tmp(reduce_nelems_per_wg, cgh); + + using KernelName = inclusive_scan_iter_local_scan_striped_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + const std::uint32_t lid = it.get_local_linear_id(); + const std::uint32_t wg_size = it.get_local_range(0); + + const auto &sg = it.get_sub_group(); + const std::uint32_t sgSize = sg.get_max_local_range()[0]; + const std::size_t sgroup_id = sg.get_group_id()[0]; + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t flat_group_id = it.get_group(0); + const std::size_t iter_gid = flat_group_id / acc_groups; + const std::size_t acc_group_id = + flat_group_id - (iter_gid * acc_groups); + + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); + + std::array local_iscan{}; + + const std::size_t inp_id0 = acc_group_id * n_wi * wg_size + + sgroup_id * n_wi * sgSize + lane_id; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::size_t inp_id = inp_id0 + m_wi * sgSize; + if constexpr (!include_initial) { + local_iscan[m_wi] = + (inp_id < acc_nelems) + ? transformer(input[inp_iter_offset + + inp_indexer(s0 + s1 * inp_id)]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (inp_id < acc_nelems && inp_id > 0) + ? 
transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * inp_id) - 1)]) + : identity; + } + } + + // change layout from striped to blocked + { + { + const std::uint32_t local_offset0 = lid * n_wi; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + slm_iscan_tmp[local_offset0 + i] = local_iscan[i]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + + { + const std::uint32_t block_offset = + sgroup_id * sgSize * n_wi; + const std::uint32_t disp0 = lane_id * n_wi; +#pragma unroll + for (nwiT i = 0; i < n_wi; ++i) { + const std::uint32_t disp = disp0 + i; + + // disp == lane_id1 + i1 * sgSize; + const std::uint32_t i1 = disp / sgSize; + const std::uint32_t lane_id1 = disp - i1 * sgSize; + + const std::uint32_t disp_exchanged = + (lane_id1 * n_wi + i1); + + local_iscan[i] = + slm_iscan_tmp[block_offset + disp_exchanged]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + } + +#pragma unroll + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); + } + // local_iscan is now result of + // inclusive scan of locally stored inputs + + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), sg, slm_iscan_tmp, local_iscan.back(), + identity, scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + const outputT modifier = (lid == 0) ? identity : slm_iscan_tmp[lid]; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier); + } + + it.barrier(sycl::access::fence_space::local_space); + + // convert back to blocked layout + { + { + const std::uint32_t local_offset0 = lid * n_wi; +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + } + + { + const std::uint32_t block_offset = + sgroup_id * sgSize * n_wi + lane_id; +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::uint32_t m_wi_scaled = m_wi * sgSize; + const std::size_t out_id = inp_id0 + m_wi_scaled; + if (out_id < acc_nelems) { + output[out_iter_offset + out_indexer(out_id)] = + slm_iscan_tmp[block_offset + m_wi_scaled]; + } + } + } + }); + }); + + return inc_scan_phase1_ev; +} + +template +sycl::event + inclusive_scan_base_step(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + // For small stride use striped load/store. + // Threshold value chosen experimentally. 
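+    // Explanatory note: with a unit stride (s1 == 1) the striped variant
+    // makes consecutive work-items in a sub-group load consecutive
+    // elements, so each sub-group load is coalesced; in the blocked
+    // variant each work-item owns a contiguous run of n_wi elements, so
+    // neighboring work-items touch addresses n_wi elements apart. The
+    // cut-off of 16 below is the experimentally chosen threshold
+    // mentioned above.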
+ if (s1 <= 16) { + return inclusive_scan_base_step_striped< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + } + else { + return inclusive_scan_base_step_blocked< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + } +} + +template +class inclusive_scan_1d_iter_chunk_update_krn; + +template +sycl::event update_local_chunks_1d(sycl::queue &exec_q, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + const sycl::event &dependent_event) +{ + const auto &ctx = exec_q.get_context(); + const auto &dev = exec_q.get_device(); + + const auto &kernel_id = sycl::get_kernel_id(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // output[ chunk_size * (i + 1) + j] += temp[i] + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + cgh.use_kernel_bundle(kb); + + static constexpr nwiT updates_per_wi = n_wi; + const std::size_t n_items = + ceiling_quotient(src_size, sg_size * n_wi) * sg_size; + + sycl::range<1> gRange{n_items}; + sycl::range<1> lRange{sg_size}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for( + ndRange, + [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) { + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(0); + const std::size_t block_offset = ndit.get_group(0) * n_wi * lws; +#pragma unroll + for (std::size_t i = 0; i < updates_per_wi; ++i) { + const std::size_t src_id = + block_offset + ndit.get_local_id(0) + i * lws; + if (src_id < src_size) { + const std::size_t scan_id = (src_id / chunk_size); + const outputT modifier = + (scan_id > 0) ? 
local_scans[scan_id - 1] : identity; + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +/* + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t n_elems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IndexerT &indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + static constexpr std::size_t _iter_nelems = 1; + + using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; + static constexpr IterIndexerT _no_op_iter_indexer{}; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT _no_op_indexer{}; + + std::size_t n_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, n_elems, input, output, s0, s1, + _no_op_iter_indexer, indexer, _no_op_indexer, transformer, scan_op, + identity, n_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (n_groups > 1) { + const std::size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + std::size_t n_groups_ = n_groups; + std::size_t temp_size = 0; + while (n_groups_ > 1) { + const std::size_t this_size = (n_groups_ - 1); + temp_size += this_size; + n_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + auto temp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(temp_size, + exec_q); + outputT *temp = temp_owner.get(); + + std::vector> stack{}; + + // inclusive scans over blocks + n_groups_ = n_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpTransformerT = NoOpTransformer; + static constexpr NoOpTransformerT _no_op_transformer{}; + std::size_t size_to_update = n_elems; + while (n_groups_ > 1) { + + const std::size_t src_size = n_groups_ - 1; + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, _no_op_iter_indexer, + _no_op_indexer, _no_op_indexer, _no_op_transformer, scan_op, + identity, n_groups_, // n_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans}); + src = local_scans; + local_scans += src_size; + size_to_update = src_size; + } + + for (std::size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); + ++reverse_stack_id) { + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + const std::size_t src_size = stack_elem.get_size(); + const outputT *local_scans = stack_elem.get_local_scans_ptr(); + + using UpdateKernelName = + class inclusive_scan_1d_iter_chunk_update_krn; + + dependent_event = update_local_chunks_1d( + exec_q, src, src_size, local_scans, chunk_size, + dependent_event); + } + + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dependent_event}, temp_owner); + + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event + accumulate_1d_contig_impl(sycl::queue 
&q, + std::size_t n_elems, + const char *src, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + return comp_ev; +} + +template +class inclusive_scan_final_chunk_update_krn; + +template +sycl::event final_update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + const OutIterIndexerT &out_iter_indexer, + const OutIndexerT &out_indexer, + sycl::event dependent_event) +{ + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + static constexpr nwiT updates_per_wi = n_wi; + const std::size_t updates_per_sg = sg_size * updates_per_wi; + const std::size_t update_nelems = + ceiling_quotient(src_size, updates_per_sg) * sg_size; + + sycl::range<2> gRange{iter_nelems, update_nelems}; + sycl::range<2> lRange{1, sg_size}; + + sycl::nd_range<2> ndRange{gRange, lRange}; + + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + cgh.parallel_for( + ndRange, [chunk_size, src_size, local_stride, src, local_scans, + out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t iter_gid = ndit.get_group(0); + + const std::size_t src_axis_id0 = + ndit.get_group(1) * updates_per_wi * lws + + ndit.get_local_id(1); + const std::size_t src_iter_id = out_iter_indexer(iter_gid); +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const std::size_t src_axis_id = src_axis_id0 + i * lws; + const std::size_t src_id = + out_indexer(src_axis_id) + src_iter_id; + + if (src_axis_id < src_size) { + const std::size_t scan_axis_id = + src_axis_id / chunk_size; + const std::size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + const outputT modifier = (scan_axis_id > 0) + ? 
local_scans[scan_id - 1] + : identity; + + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + sycl::event dependent_event) +{ + static constexpr NoOpIndexer out_indexer{}; + static constexpr NoOpIndexer iter_out_indexer{}; + + return final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, iter_out_indexer, out_indexer, dependent_event); +} + +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const InpIterIndexerT &inp_iter_indexer, + const OutIterIndexerT &out_iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InpIterIndexerT, OutIterIndexerT>; + const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer}; + + std::size_t acc_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (acc_groups > 1) { + const std::size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + std::size_t acc_groups_ = acc_groups; + std::size_t temp_size = 0; + while (acc_groups_ > 1) { + const std::size_t this_size = (acc_groups_ - 1); + temp_size += this_size; + acc_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + auto temp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * temp_size, exec_q); + outputT *temp = temp_owner.get(); + + std::vector> stack{}; + + // inclusive scans over blocks + acc_groups_ = acc_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT _no_op_indexer{}; + using NoOpTransformerT = NoOpTransformer; + static constexpr NoOpTransformerT _no_op_transformer{}; + std::size_t size_to_update = acc_nelems; + + { + std::size_t src_size = acc_groups - 1; + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan_iter_indexer{/* size */ iter_nelems, + /* step */ src_size}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + OutIterIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{out_iter_indexer, + scan_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, out_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + 
local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + while (acc_groups_ > 1) { + std::size_t src_size = acc_groups_ - 1; + + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan1_iter_indexer{ + /* size */ iter_nelems, + /* step */ size_to_update}; + const LocalScanIndexerT scan2_iter_indexer{/* size */ iter_nelems, + /* step */ src_size}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + LocalScanIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{scan1_iter_indexer, + scan2_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, _no_op_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + for (std::size_t reverse_stack_id = 0; + reverse_stack_id < stack.size() - 1; ++reverse_stack_id) { + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + std::size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + std::size_t local_stride = stack_elem.get_local_stride(); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn; + + dependent_event = + update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, dependent_event); + } + + // last stack element is always directly to output + { + const auto &stack_elem = stack[0]; + outputT *src = stack_elem.get_src_ptr(); + const std::size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + const std::size_t local_stride = stack_elem.get_local_stride(); + + using UpdateKernelName = + class inclusive_scan_final_chunk_update_krn< + outputT, n_wi, OutIterIndexerT, OutIndexerT, ScanOpT>; + + dependent_event = + final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, out_iter_indexer, out_indexer, + dependent_event); + } + + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dependent_event}, temp_owner); + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event + accumulate_strided_impl(sycl::queue &q, + std::size_t iter_nelems, + std::size_t acc_nelems, + const char *src, + int iter_nd, + const ssize_t *iter_shape_strides, + ssize_t inp_iter_offset, + ssize_t out_iter_offset, + int acc_nd, + const ssize_t *acc_shape_strides, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using InpIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const InpIndexerT inp_axis_indexer{acc_nd, 0, acc_shape_strides}; + const InpIndexerT inp_iter_indexer{iter_nd, inp_iter_offset, + iter_shape_strides}; + + using OutIndexerT = 
dpctl::tensor::offset_utils::UnpackedStridedIndexer; + const OutIndexerT out_axis_indexer{acc_nd, 0, acc_shape_strides, + acc_shape_strides + 2 * acc_nd}; + const OutIndexerT out_iter_indexer{iter_nd, out_iter_offset, + iter_shape_strides, + iter_shape_strides + 2 * iter_nd}; + + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + + return comp_ev; +} + +typedef std::size_t (*cumsum_val_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +std::size_t cumsum_val_contig_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 
64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); + + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); + copy_e.wait(); + std::size_t return_val = static_cast(*last_elem_host_usm); + + // explicitly free USM host allocation, by invoking deleter of + // the unique_ptr + host_usm_owner.reset(nullptr); + + return return_val; +} + +template +struct MaskPositionsContigFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } +}; + +template +struct MaskPositionsContigFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } +}; + +template +struct Cumsum1DContigFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +typedef std::size_t (*cumsum_val_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +std::size_t + cumsum_val_strided_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + int nd, + const ssize_t *shape_strides, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + + using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexerT strided_indexer{nd, 0, shape_strides}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 
64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); + } + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); + + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); + copy_e.wait(); + std::size_t return_val = static_cast(*last_elem_host_usm); + + // explicitly free USM-host temporary, by invoking deleter of + // the unique_ptr + host_usm_owner.reset(nullptr); + + return return_val; +} + +template +struct MaskPositionsStridedFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } +}; + +template +struct MaskPositionsStridedFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } +}; + +template +struct Cumsum1DStridedFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +} // namespace dpctl::tensor::kernels::accumulators diff --git a/dpnp/tensor/libtensor/include/kernels/alignment.hpp b/dpnp/tensor/libtensor/include/kernels/alignment.hpp new file mode 100644 index 000000000000..a67e9b15306e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/alignment.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace dpctl::tensor::kernels::alignment_utils
+{
+inline constexpr std::size_t required_alignment = 64UL;
+
+template <std::uintptr_t alignment, typename Ptr>
+bool is_aligned(Ptr p)
+{
+    return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
+}
+
+template <typename KernelName>
+class disabled_sg_loadstore_wrapper_krn;
+} // namespace dpctl::tensor::kernels::alignment_utils
diff --git a/dpnp/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpnp/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
new file mode 100644
index 000000000000..046ad87d7d78
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -0,0 +1,853 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for advanced tensor index operations.
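+///
+/// The mask-based kernels below consume the inclusive cumulative sum of the
+/// boolean mask: when the mask is set at flat position i, `cumsum[i] - 1` is
+/// the position of element i in the compacted output, which is what makes
+/// `dst = src[mask]`-style extraction and `dst[mask] = rhs`-style placement
+/// data-parallel.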
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::indexing
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+template <typename OrthogIndexerT,
+          typename MaskedSrcIndexerT,
+          typename MaskedDstIndexerT,
+          typename dataT,
+          typename indT,
+          typename LocalAccessorT>
+struct MaskedExtractStridedFunctor
+{
+    MaskedExtractStridedFunctor(const dataT *src_data_p,
+                                const indT *cumsum_data_p,
+                                dataT *dst_data_p,
+                                std::size_t masked_iter_size,
+                                const OrthogIndexerT &orthog_src_dst_indexer_,
+                                const MaskedSrcIndexerT &masked_src_indexer_,
+                                const MaskedDstIndexerT &masked_dst_indexer_,
+                                const LocalAccessorT &lacc_)
+        : src(src_data_p), cumsum(cumsum_data_p), dst(dst_data_p),
+          masked_nelems(masked_iter_size),
+          orthog_src_dst_indexer(orthog_src_dst_indexer_),
+          masked_src_indexer(masked_src_indexer_),
+          masked_dst_indexer(masked_dst_indexer_), lacc(lacc_)
+    {
+        static_assert(
+            std::is_same_v<indT, typename LocalAccessorT::value_type>);
+    }
+
+    void operator()(sycl::nd_item<2> ndit) const
+    {
+        const std::size_t orthog_i = ndit.get_global_id(0);
+        const std::uint32_t l_i = ndit.get_local_id(1);
+        const std::uint32_t lws = ndit.get_local_range(1);
+
+        const std::size_t masked_i = ndit.get_global_id(1);
+        const std::size_t masked_block_start = masked_i - l_i;
+
+        const std::size_t max_offset = masked_nelems + 1;
+        for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+            const std::size_t offset = masked_block_start + i;
+            lacc[i] = (offset == 0)           ? indT(0)
+                      : (offset < max_offset) ? cumsum[offset - 1]
+                                              : cumsum[masked_nelems - 1] + 1;
+        }
+
+        sycl::group_barrier(ndit.get_group());
+
+        const indT current_running_count = lacc[l_i + 1];
+        const bool mask_set = (masked_i == 0)
+                                  ? (current_running_count == 1)
+                                  : (current_running_count == lacc[l_i] + 1);
+
+        // dst[cumsum[i] - 1, j] = src[i, j]
+        // if cumsum[i] == ((i > 0) ?
+template <typename OrthogIndexerT,
+          typename MaskedDstIndexerT,
+          typename MaskedRhsIndexerT,
+          typename dataT,
+          typename indT,
+          typename LocalAccessorT>
+struct MaskedPlaceStridedFunctor
+{
+    MaskedPlaceStridedFunctor(dataT *dst_data_p,
+                              const indT *cumsum_data_p,
+                              const dataT *rhs_data_p,
+                              std::size_t masked_iter_size,
+                              const OrthogIndexerT &orthog_dst_rhs_indexer_,
+                              const MaskedDstIndexerT &masked_dst_indexer_,
+                              const MaskedRhsIndexerT &masked_rhs_indexer_,
+                              const LocalAccessorT &lacc_)
+        : dst(dst_data_p), cumsum(cumsum_data_p), rhs(rhs_data_p),
+          masked_nelems(masked_iter_size),
+          orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_),
+          masked_dst_indexer(masked_dst_indexer_),
+          masked_rhs_indexer(masked_rhs_indexer_), lacc(lacc_)
+    {
+        static_assert(
+            std::is_same_v<indT, typename LocalAccessorT::value_type>);
+    }
+
+    void operator()(sycl::nd_item<2> ndit) const
+    {
+        const std::size_t orthog_i = ndit.get_global_id(0);
+        const std::uint32_t l_i = ndit.get_local_id(1);
+        const std::uint32_t lws = ndit.get_local_range(1);
+
+        const std::size_t masked_i = ndit.get_global_id(1);
+        const std::size_t masked_block_start = masked_i - l_i;
+
+        const std::size_t max_offset = masked_nelems + 1;
+        for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+            const std::size_t offset = masked_block_start + i;
+            lacc[i] = (offset == 0) ? indT(0)
+                      : (offset < max_offset) ? cumsum[offset - 1]
+                                              : cumsum[masked_nelems - 1] + 1;
+        }
+
+        sycl::group_barrier(ndit.get_group());
+
+        const indT current_running_count = lacc[l_i + 1];
+        const bool mask_set = (masked_i == 0)
+                                  ? (current_running_count == 1)
+                                  : (current_running_count == lacc[l_i] + 1);
+
+        // dst[i, j] = rhs[cumsum[i] - 1, j]
+        // if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1)
+        if (mask_set && (masked_i < masked_nelems)) {
+            const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i);
+
+            const std::size_t total_dst_offset =
+                masked_dst_indexer(masked_i) +
+                orthog_offsets.get_first_offset();
+            const std::size_t total_rhs_offset =
+                masked_rhs_indexer(current_running_count - 1) +
+                orthog_offsets.get_second_offset();
+
+            dst[total_dst_offset] = rhs[total_rhs_offset];
+        }
+    }
+
+private:
+    dataT *dst = nullptr;
+    const indT *cumsum = nullptr;
+    const dataT *rhs = nullptr;
+    std::size_t masked_nelems = 0;
+    // has nd, shape, dst_strides, rhs_strides for
+    // dimensions that ARE NOT masked
+    OrthogIndexerT orthog_dst_rhs_indexer;
+    // has nd, shape, dst_strides for
+    // dimensions that ARE masked
+    MaskedDstIndexerT masked_dst_indexer;
+    // has 1, rhs_strides for dimensions that ARE masked
+    MaskedRhsIndexerT masked_rhs_indexer;
+    LocalAccessorT lacc;
+};
+
+// ======= Masked extraction ================================
+
+namespace detail
+{
+
+template <std::size_t I, std::size_t... IR>
+std::size_t _get_lws_impl(std::size_t n)
+{
+    if constexpr (sizeof...(IR) == 0) {
+        return I;
+    }
+    else {
+        return (n < I) ? _get_lws_impl<IR...>(n) : I;
+    }
+}
+
+inline std::size_t get_lws(std::size_t n)
+{
+    static constexpr std::size_t lws0 = 256u;
+    static constexpr std::size_t lws1 = 128u;
+    static constexpr std::size_t lws2 = 64u;
+    return _get_lws_impl<lws0, lws1, lws2>(n);
+}
+
+} // end of namespace detail
+
+template <typename MaskedDstIndexerT, typename dataT, typename indT>
+class masked_extract_all_slices_contig_impl_krn;
+
+typedef sycl::event (*masked_extract_all_slices_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    const char *,
+    const char *,
+    char *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename dataT, typename indT>
+sycl::event masked_extract_all_slices_contig_impl(
+    sycl::queue &exec_q,
+    ssize_t iteration_size,
+    const char *src_p,
+    const char *cumsum_p,
+    char *dst_p,
+    ssize_t dst_size, // dst is 1D
+    ssize_t dst_stride,
+    const std::vector<sycl::event> &depends = {})
+{
+    static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
+
+    static constexpr NoOpIndexer masked_src_indexer{};
+    const Strided1DIndexer masked_dst_indexer(/* size */ dst_size,
+                                              /* step */ dst_stride);
+
+    using KernelName =
+        class masked_extract_all_slices_contig_impl_krn<Strided1DIndexer,
+                                                        dataT, indT>;
+
+    using LocalAccessorT = sycl::local_accessor<indT, 1>;
+    using Impl =
+        struct MaskedExtractStridedFunctor<TwoZeroOffsets_Indexer,
+                                           NoOpIndexer, Strided1DIndexer,
+                                           dataT, indT, LocalAccessorT>;
+
+    const std::size_t masked_extent = iteration_size;
+
+    const std::size_t lws = detail::get_lws(masked_extent);
+
+    const std::size_t n_groups = (iteration_size + lws - 1) / lws;
+
+    sycl::range<2> gRange{1, n_groups * lws};
+    sycl::range<2> lRange{1, lws};
+
+    sycl::nd_range<2> ndRange(gRange, lRange);
+
+    const dataT *src_tp = reinterpret_cast<const dataT *>(src_p);
+    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
+    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(lws, masked_extent) + 1;
+        LocalAccessorT lacc(lacc_size, cgh);
+
+        cgh.parallel_for<KernelName>(
+            ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_extent,
+                          orthog_src_dst_indexer, masked_src_indexer,
+                          masked_dst_indexer, lacc));
+    });
+
+    return comp_ev;
+}
+
+template <typename MaskedSrcIndexerT,
+          typename MaskedDstIndexerT,
+          typename dataT,
+          typename indT>
+class masked_extract_all_slices_strided_impl_krn;
+
+typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    const char *,
+    const char *,
+    char *,
+    int,
+    ssize_t const *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename dataT, typename indT>
+sycl::event masked_extract_all_slices_strided_impl(
+    sycl::queue &exec_q,
+    ssize_t
iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int nd, + const ssize_t + *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd + ssize_t dst_size, // dst is 1D + ssize_t dst_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); + const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, + /* step */ dst_stride); + + using KernelName = class masked_extract_all_slices_strided_impl_krn< + StridedIndexer, Strided1DIndexer, dataT, indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_nelems = iteration_size; + + const std::size_t lws = detail::get_lws(masked_nelems); + + const std::size_t n_groups = (masked_nelems + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_nelems) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + const char *, + const char *, + char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +class masked_extract_some_slices_strided_impl_krn; + +template +sycl::event masked_extract_some_slices_strided_impl( + sycl::queue &exec_q, + ssize_t orthog_nelems, + ssize_t masked_nelems, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int orthog_nd, + // [ortho_shape, ortho_src_strides, // ortho_dst_strides], + // length 3*ortho_nd + const ssize_t *packed_ortho_src_dst_shape_strides, + ssize_t ortho_src_offset, + ssize_t ortho_dst_offset, + int masked_nd, + // [masked_src_shape, masked_src_strides], + // length 2*masked_nd, mask_dst is 1D + const ssize_t *packed_masked_src_shape_strides, + ssize_t masked_dst_size, + ssize_t masked_dst_stride, + const std::vector &depends = {}) +{ + const TwoOffsets_StridedIndexer orthog_src_dst_indexer{ + orthog_nd, ortho_src_offset, ortho_dst_offset, + packed_ortho_src_dst_shape_strides}; + + const StridedIndexer masked_src_indexer{masked_nd, 0, + packed_masked_src_shape_strides}; + const Strided1DIndexer masked_dst_indexer{/* size */ masked_dst_size, + /* step */ masked_dst_stride}; + + using KernelName = class masked_extract_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DIndexer, dataT, + indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_extent = masked_nelems; + + const std::size_t lws = detail::get_lws(masked_extent); + + const std::size_t n_groups = ((masked_extent + lws - 1) / lws); + const std::size_t orthog_extent = static_cast(orthog_nelems); + + sycl::range<2> gRange{orthog_extent, n_groups * 
lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = + std::min(lws, masked_extent) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +template +struct MaskExtractAllSlicesContigFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesContigFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractSomeSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractSomeSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +// Masked placement + +template +class masked_place_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + char *, + const char *, + const char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_place_all_slices_strided_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int nd, + const ssize_t + *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd + ssize_t rhs_size, // rhs is 1D + ssize_t rhs_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); + const Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + + using KernelName = class masked_place_all_slices_strided_impl_krn< + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT, + indT>; + + static constexpr std::size_t nominal_lws = 256; + const std::size_t masked_extent = iteration_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + MaskedPlaceStridedFunctor; + + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, iteration_size, + 
orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer, lacc)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + char *, + const char *, + const char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +class masked_place_some_slices_strided_impl_krn; + +template +sycl::event masked_place_some_slices_strided_impl( + sycl::queue &exec_q, + ssize_t orthog_nelems, + ssize_t masked_nelems, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int orthog_nd, + // [ortho_shape, ortho_dst_strides, ortho_rhs_strides], + // length 3*ortho_nd + const ssize_t *packed_ortho_dst_rhs_shape_strides, + ssize_t ortho_dst_offset, + ssize_t ortho_rhs_offset, + int masked_nd, + // [masked_dst_shape, masked_dst_strides], + // length 2*masked_nd, mask_dst is 1D + const ssize_t *packed_masked_dst_shape_strides, + ssize_t masked_rhs_size, + ssize_t masked_rhs_stride, + const std::vector &depends = {}) +{ + const TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ + orthog_nd, ortho_dst_offset, ortho_rhs_offset, + packed_ortho_dst_rhs_shape_strides}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_dst_indexer{masked_nd, 0, + packed_masked_dst_shape_strides}; + const Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, + masked_rhs_stride}; + + using KernelName = class masked_place_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>; + + static constexpr std::size_t nominal_lws = 256; + const std::size_t orthog_extent = orthog_nelems; + const std::size_t masked_extent = masked_nelems; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{orthog_extent, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + MaskedPlaceStridedFunctor; + + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer, lacc)); + }); + + return comp_ev; +} + +template +struct MaskPlaceAllSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceAllSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceSomeSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceSomeSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +// Non-zero + +template +class non_zero_indexes_krn; + +typedef sycl::event (*non_zero_indexes_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + int, + const char *, + char *, + const ssize_t *, + std::vector const &); + 
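+// Illustrative note (added commentary, not upstream code): for every set
+// element of the flattened mask, the kernel below writes its multi-index,
+// unraveled in C (row-major) order against mask_shape, into a
+// (nd, nz_elems)-shaped output. E.g. for a mask of shape (2, 3) with flat
+// index i = 4 and output slot cs_val = cumsum[4] - 1, the loop peels
+// dimensions right to left: 4 = 1 * 3 + 1, so it stores
+// indexes[cs_val + 1 * nz_elems] = 1 (column) and indexes[cs_val] = 1 (row).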
+template <typename indT1, typename indT2>
+sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
+                                  ssize_t iter_size,
+                                  ssize_t nz_elems,
+                                  int nd,
+                                  const char *cumsum_cp,
+                                  char *indexes_cp,
+                                  const ssize_t *mask_shape,
+                                  std::vector<sycl::event> const &depends)
+{
+    const indT1 *cumsum_data = reinterpret_cast<const indT1 *>(cumsum_cp);
+    indT2 *indexes_data = reinterpret_cast<indT2 *>(indexes_cp);
+
+    static constexpr std::size_t nominal_lws = 256u;
+    const std::size_t masked_extent = iter_size;
+    const std::size_t lws = std::min(masked_extent, nominal_lws);
+
+    const std::size_t n_groups = (masked_extent + lws - 1) / lws;
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::range<1> lRange{lws};
+
+    sycl::nd_range<1> ndRange{gRange, lRange};
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(lws, masked_extent) + 1;
+        sycl::local_accessor<indT1, 1> lacc(lacc_size, cgh);
+
+        using KernelName = class non_zero_indexes_krn<indT1, indT2>;
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
+            const std::size_t group_i = ndit.get_group(0);
+            const std::uint32_t l_i = ndit.get_local_id(0);
+            const std::uint32_t lws = ndit.get_local_range(0);
+
+            const std::size_t masked_block_start = group_i * lws;
+
+            for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+                const std::size_t offset = masked_block_start + i;
+                lacc[i] = (offset == 0) ? indT1(0)
+                          : (offset - 1 < masked_extent)
+                              ? cumsum_data[offset - 1]
+                              : cumsum_data[masked_extent - 1] + 1;
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            const std::size_t i = masked_block_start + l_i;
+            const auto cs_val = lacc[l_i];
+            const bool cond = (lacc[l_i + 1] == cs_val + 1);
+
+            if (cond && (i < masked_extent)) {
+                ssize_t i_ = static_cast<ssize_t>(i);
+                for (int dim = nd; --dim > 0;) {
+                    const auto sd = mask_shape[dim];
+                    const ssize_t q = i_ / sd;
+                    const ssize_t r = (i_ - q * sd);
+                    indexes_data[cs_val + dim * nz_elems] =
+                        static_cast<indT2>(r);
+                    i_ = q;
+                }
+                indexes_data[cs_val] = static_cast<indT2>(i_);
+            }
+        });
+    });
+
+    return comp_ev;
+}
+
+} // namespace dpctl::tensor::kernels::indexing
diff --git a/dpnp/tensor/libtensor/include/kernels/clip.hpp b/dpnp/tensor/libtensor/include/kernels/clip.hpp
new file mode 100644
index 000000000000..900fcf3df100
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/clip.hpp
@@ -0,0 +1,356 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for dpctl.tensor.clip.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "kernels/alignment.hpp"
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::clip
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
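+// Added commentary (assumption about dpctl's math_utils, not verified here):
+// for complex types the clip() helper below delegates to max_complex /
+// min_complex, which are understood to order values lexicographically by
+// real part and then by imaginary part, so complex clipping follows NumPy's
+// ordering of complex numbers rather than clamping real and imaginary parts
+// independently.
+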
+template <typename T>
+T clip(const T &x, const T &min, const T &max)
+{
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr (is_complex<T>::value) {
+        using dpctl::tensor::math_utils::max_complex;
+        using dpctl::tensor::math_utils::min_complex;
+        return min_complex<T>(max_complex<T>(x, min), max);
+    }
+    else if constexpr (std::is_floating_point_v<T> ||
+                       std::is_same_v<T, sycl::half>)
+    {
+        auto tmp = (std::isnan(x) || x > min) ? x : min;
+        return (std::isnan(tmp) || tmp < max) ? tmp : max;
+    }
+    else if constexpr (std::is_same_v<T, bool>) {
+        return (x || min) && max;
+    }
+    else {
+        auto tmp = (x > min) ? x : min;
+        return (tmp < max) ? tmp : max;
+    }
+}
+
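+// Illustrative note (added commentary): for floating-point T the branches
+// above propagate NaN instead of clamping it, matching numpy.clip():
+// clip(NaN, 0.0, 1.0) == NaN (the isnan(x) test selects x, and isnan(tmp)
+// then returns it), while clip(2.0, 0.0, 1.0) == 1.0. NaN bounds likewise
+// yield NaN, since ordered comparisons against NaN are false.
+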
+template <typename T,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+class ClipContigFunctor
+{
+private:
+    std::size_t nelems = 0;
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+
+public:
+    ClipContigFunctor(std::size_t nelems_,
+                      const T *x_p_,
+                      const T *min_p_,
+                      const T *max_p_,
+                      T *dst_p_)
+        : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_),
+          dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+            const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;
+
+            const std::size_t start =
+                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems, start + nelems_per_sg);
+
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                dst_p[offset] =
+                    clip(x_p[offset], min_p[offset], max_p[offset]);
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const std::size_t base =
+                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                 sg.get_group_id()[0] * sgSize);
+
+            if (base + nelems_per_wi * sgSize < nelems) {
+                sycl::vec<T, vec_sz> dst_vec;
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    const std::size_t idx = base + it * sgSize;
+                    auto x_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&x_p[idx]);
+                    auto min_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&min_p[idx]);
+                    auto max_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&max_p[idx]);
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[idx]);
+
+                    const sycl::vec<T, vec_sz> x_vec =
+                        sub_group_load<vec_sz>(sg, x_multi_ptr);
+                    const sycl::vec<T, vec_sz> min_vec =
+                        sub_group_load<vec_sz>(sg, min_multi_ptr);
+                    const sycl::vec<T, vec_sz> max_vec =
+                        sub_group_load<vec_sz>(sg, max_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                        dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
+                                               max_vec[vec_id]);
+                    }
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems; k += sgSize) {
+                    dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
+                }
+            }
+        }
+    }
+};
+
+template <typename T, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class clip_contig_kernel;
+
+typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_contig_impl(sycl::queue &q,
+                             std::size_t nelems,
+                             const char *x_cp,
+                             const char *min_cp,
+                             const char *max_cp,
+                             char *dst_cp,
+                             const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        std::size_t lws = 64;
+        static constexpr std::uint8_t vec_sz = 4;
+        static constexpr std::uint8_t n_vecs = 2;
+        const std::size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) /
(lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(x_cp) && + is_aligned(min_cp) && + is_aligned(max_cp) && + is_aligned(dst_cp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = clip_contig_kernel; + using Impl = + ClipContigFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = clip_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = + ClipContigFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); + } + }); + + return clip_ev; +} + +template +class ClipStridedFunctor +{ +private: + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + IndexerT indexer; + +public: + ClipStridedFunctor(const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_, + const IndexerT &indexer_) + : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + std::size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + dst_p[offsets.get_fourth_offset()] = clip( + x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()], + max_p[offsets.get_third_offset()]); + } +}; + +template +class clip_strided_kernel; + +typedef sycl::event (*clip_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const char *, + const char *, + const char *, + char *, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event clip_strided_impl(sycl::queue &q, + std::size_t nelems, + int nd, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const ssize_t *shape_strides, + ssize_t x_offset, + ssize_t min_offset, + ssize_t max_offset, + ssize_t dst_offset, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const FourOffsets_StridedIndexer indexer{ + nd, x_offset, min_offset, max_offset, dst_offset, shape_strides}; + + using KernelName = clip_strided_kernel; + using Impl = ClipStridedFunctor; + + cgh.parallel_for( + sycl::range<1>(nelems), + Impl(x_tp, min_tp, max_tp, dst_tp, indexer)); + }); + + return clip_ev; +} + +template +struct ClipStridedFactory +{ + fnT get() + { + fnT fn = clip_strided_impl; + return fn; + } +}; + +template +struct ClipContigFactory +{ + fnT get() + { + + fnT fn = clip_contig_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::clip diff --git a/dpnp/tensor/libtensor/include/kernels/constructors.hpp b/dpnp/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 000000000000..67f2502067ca --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,575 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::constructors +{ + +using dpctl::tensor::ssize_t; + +/*! + @defgroup CtorKernels + */ + +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; +template +class full_strided_kernel; +template +class eye_kernel; + +using namespace dpctl::tensor::offset_utils; + +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Starting value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? 
nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + +/* ================ Full ================== */ + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &q, + std::size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +template +class FullStridedFunctor +{ +private: + Ty *p = nullptr; + Ty fill_v; + IndexerT indexer; + +public: + FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_) + : p(p_), fill_v(fill_v_), indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + auto offset = indexer(id.get(0)); + p[offset] = fill_v; + } +}; + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &q, + int nd, + std::size_t nelems, + const ssize_t *shape_strides, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + dstTy *dst_tp = reinterpret_cast(dst_p); + + using dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexer strided_indexer(nd, 0, shape_strides); + + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = full_strided_kernel; + using Impl = FullStridedFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(dst_tp, fill_v, strided_indexer)); + }); + + return fill_ev; +} + +/* ================ Eye ================== */ + +typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &, + std::size_t nelems, // num_elements + ssize_t start, + ssize_t end, + ssize_t step, + char *, // dst_data_ptr + const std::vector &); + +template +class EyeFunctor +{ +private: + Ty *p = nullptr; + ssize_t start_v; + ssize_t end_v; + ssize_t step_v; + +public: + EyeFunctor(char *dst_p, + const ssize_t v0, + const ssize_t v1, + const ssize_t dv) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + Ty set_v = 0; + ssize_t i = static_cast(wiid.get(0)); + if (i >= start_v and i <= end_v) { + if ((i - start_v) % step_v == 0) { + set_v = 1; + } + } + p[i] = set_v; + } +}; + +/*! + * @brief Function to populate 2D array with eye matrix. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Number of elements to assign. + * @param start Position of the first non-zero value. + * @param end Position of the last non-zero value. + * @param step Number of array elements between non-zeros. + * @param array_data Kernel accessible USM pointer for the destination array. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event eye_impl(sycl::queue &exec_q, + std::size_t nelems, + const ssize_t start, + const ssize_t end, + const ssize_t step, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = eye_kernel; + using Impl = EyeFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start, end, step)); + }); + + return eye_event; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template +struct EyeFactory +{ + fnT get() + { + fnT f = eye_impl; + return f; + } +}; + +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, + ssize_t, // inner_range //ssize_t + ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + ssize_t, // nd + ssize_t *, // shape_and_strides + ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. 
+ * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. + * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +class tri_kernel; +template +sycl::event tri_impl(sycl::queue &exec_q, + ssize_t inner_range, + ssize_t outer_range, + char *src_p, + char *dst_p, + ssize_t nd, + ssize_t *shape_and_strides, + ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + static constexpr int d2 = 2; + ssize_t src_s = nd; + ssize_t dst_s = 2 * nd; + ssize_t nd_1 = nd - 1; + ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + ssize_t outer_gid = idx[0] / inner_range; + ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + ssize_t src_inner_offset = 0, dst_inner_offset = 0; + bool to_copy{false}; + + { + using dpctl::tensor::strides::CIndexer_array; + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); + src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + ssize_t src_offset = 0; + ssize_t dst_offset = 0; + { + using dpctl::tensor::strides::CIndexer_vector; + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template +struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
+ * @ingroup CtorKernels
+ */
+template <typename fnT, typename Ty>
+struct TriuGenericFactory
+{
+    fnT get()
+    {
+        fnT f = tri_impl<Ty, /* upper = */ false>;
+        return f;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::constructors
diff --git a/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
new file mode 100644
index 000000000000..2c4146d467e6
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -0,0 +1,1273 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor copying and value casting.
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::copy_and_cast +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class copy_cast_generic_kernel; + +template +class copy_cast_contig_kernel; + +template +class copy_cast_from_host_kernel; + +template +class copy_cast_from_host_contig_kernel; + +template +class Caster +{ +public: + Caster() = default; + dstTy operator()(const srcTy &src) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(src); + } +}; + +template +class GenericCopyFunctor +{ +private: + const srcT *src_ = nullptr; + dstT *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer) + : src_(src_p), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + static constexpr CastFnT fn{}; + dst_[dst_offset] = fn(src_[src_offset]); + } +}; + +/*! + @defgroup CopyAndCastKernels + */ + +/*! + * @brief Function pointer type for generic array cast and copying function. + */ +typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to + `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have array dimensionality specified via argument `nd`. The + `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the + first `nd` elements encode common shape, second `nd` elements contain strides + of `src` array, and the trailing `nd` elements contain strides of `dst` array. + `src_p` and `dst_p` represent pointers into respective arrays, but the start of + iteration begins at offset of `src_offset` elements for `src` array and at + offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue + `q` with events `depends` and `additional_depends` as dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param nd Array dimensionality, i.e. number of indices needed to + identify an element of each array. + @param shape_and_strides Kernel accessible USM pointer to packed shape and + strides. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of + elements of source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of + elements of destination array from `dst_p`. 
+ @param depends List of events to wait for before starting computations, if + any. + @param additional_depends Additional list of events to wait for before + starting computations, if any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_generic_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset, + shape_and_strides}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFunctor, + TwoOffsets_StridedIndexer>(src_tp, dst_tp, + indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get generic function pointer of type `fnT` for given source + * data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastGenericFactory +{ + fnT get() + { + fnT f = copy_and_cast_generic_impl; + return f; + } +}; + +// Specialization of copy_and_cast for contiguous arrays + +template +class ContigCopyFunctor +{ +private: + std::size_t nelems; + const srcT *src_p = nullptr; + dstT *dst_p = nullptr; + +public: + ContigCopyFunctor(const std::size_t nelems_, + const srcT *src_p_, + dstT *dst_p_) + : nelems(nelems_), src_p(src_p_), dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr CastFnT fn{}; + + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex_v; + if constexpr (!enable_sg_loadstore || is_complex_v || + is_complex_v) { + std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * elems_per_sg + (gid % sgSize) + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = fn(src_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&src_p[offset]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[offset]); + + const sycl::vec src_vec = + sub_group_load(sg, src_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; k++) { + dst_vec[k] = fn(src_vec[k]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + 
const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < nelems; k += sgSize) { + dst_p[k] = fn(src_p[k]); + } + } + } + } +}; + +/*! + * @brief Function pointer type for contiguous array cast and copy function. + */ +typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +/*! + * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray + to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have the same number of elements `nelems`. + `src_cp` and `dst_cp` represent char pointers to the start of respective + arrays. Kernel is submitted to sycl queue `q` with events `depends` as + dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param src_p Kernel accessible USM pointer for the source array + @param dst_p Kernel accessible USM pointer for the destination array + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *src_cp, + char *dst_cp, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const srcTy *src_tp = reinterpret_cast(src_cp); + dstTy *dst_tp = reinterpret_cast(dst_cp); + + std::size_t lws = 64; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(src_cp) && + is_aligned(dst_cp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = + copy_cast_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, enable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + copy_cast_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, disable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get specialized function pointer for casting and copying + * contiguous arrays. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_contig_impl; + return f; + } +}; + +// Specialization of copy_and_cast for 1D arrays + +/*! + * @brief Factory to get function pointer for casting and copying 1D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Factory to get function pointer for casting and copying 2D arrays. 
+ * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_nd_specialized_impl( + sycl::queue &q, + std::size_t nelems, + const std::array &shape, + const std::array &src_strides, + const std::array &dst_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + using IndexerT = TwoOffsets_FixedDimStridedIndexer; + const IndexerT indexer{shape, src_strides, dst_strides, src_offset, + dst_offset}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.depends_on(depends); + cgh.parallel_for< + class copy_cast_generic_kernel>( + sycl::range<1>(nelems), + GenericCopyFunctor, IndexerT>( + src_tp, dst_tp, indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCast2DFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_nd_specialized_impl<D, S, 2>;
+        return f;
+    }
+};
+
+// ====================== Copying from host to USM
+
+template <typename AccessorT,
+          typename dstTy,
+          typename CastFnT,
+          typename IndexerT>
+class GenericCopyFromHostFunctor
+{
+private:
+    AccessorT src_acc_;
+    dstTy *dst_ = nullptr;
+    IndexerT indexer_;
+
+public:
+    GenericCopyFromHostFunctor(const AccessorT &src_acc,
+                               dstTy *dst_p,
+                               const IndexerT &indexer)
+        : src_acc_(src_acc), dst_(dst_p), indexer_(indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const auto &offsets = indexer_(static_cast<ssize_t>(wiid.get(0)));
+        const ssize_t &src_offset = offsets.get_first_offset();
+        const ssize_t &dst_offset = offsets.get_second_offset();
+
+        CastFnT fn{};
+        dst_[dst_offset] = fn(src_acc_[src_offset]);
+    }
+};
+
+typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
+ * into usm_ndarray with elements of type `dstTy`.
+ *
+ * Function to cast and copy elements from numpy.ndarray specified by typeless
+ * `host_src_p` and the `src_offset` given in the number of array elements.
+ * Arrays' metadata are given in packed USM vector of length `3*nd` whose first
+ * `nd` elements contain arrays' shape, next `nd` elements specify source
+ * strides in elements (not bytes), and trailing `nd` elements specify
+ * destination array strides. Kernel dependencies are given by two vectors of
+ * events: `depends` and `additional_depends`. The function execution is
+ * complete at the return.
+ *
+ * @param q The queue where the routine should be executed.
+ * @param nelems Number of elements to cast and copy.
+ * @param nd The dimensionality of arrays
+ * @param shape_and_strides Kernel accessible USM pointer to packed shape and
+ * strides.
+ * @param host_src_p Host (not USM allocated) pointer associated with the
+ * source array.
+ * @param src_offset Offset to the beginning of iteration in number of elements
+ * of the source array from `host_src_p`.
+ * @param src_min_nelem_offset Smallest value of offset relative to
+ * `host_src_p` in number of elements attained while iterating over elements of
+ * the source array.
+ * @param src_max_nelem_offset Largest value of offset relative to `host_src_p`
+ * in number of elements attained while iterating over elements of the source
+ * array.
+ * @param dst_p USM pointer associated with the destination array.
+ * @param dst_offset Offset to the beginning of iteration in number of elements
+ * of the destination array from `dst_p`.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ * @param additional_depends List of additional events to wait for before
+ * starting computations, if any.
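+ *
+ * For example (editor's illustration), for `nd == 2` with shape `(m, n)` the
+ * packed metadata reads `[m, n, src_s0, src_s1, dst_s0, dst_s1]`, with all
+ * strides expressed in elements.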
+ *
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+void copy_and_cast_from_host_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *host_src_p,
+    ssize_t src_offset,
+    ssize_t src_min_nelem_offset,
+    ssize_t src_max_nelem_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1;
+
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::buffer<srcTy, 1> npy_buf(
+        reinterpret_cast<const srcTy *>(host_src_p) + src_min_nelem_offset,
+        sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}});
+
+    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+
+        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
+
+        const TwoOffsets_StridedIndexer indexer{
+            nd, src_offset - src_min_nelem_offset, dst_offset,
+            const_cast<const ssize_t *>(shape_and_strides)};
+
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
+
+        cgh.parallel_for<copy_cast_from_host_kernel<srcTy, dstTy>>(
+            sycl::range<1>(nelems),
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>,
+                                       TwoOffsets_StridedIndexer>(
+                npy_acc, dst_tp, indexer));
+    });
+
+    // perform explicit synchronization. Implicit synchronization would be
+    // performed by sycl::buffer destructor.
+    copy_and_cast_from_host_ev.wait();
+
+    return;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given NumPy array
+ * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastFromHostFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_from_host_impl<D, S>;
+        return f;
+    }
+};
+
+typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,  /* nelems */
+    const char *, /* src_pointer */
+    ssize_t,      /* src_offset */
+    char *,       /* dst_pointer */
+    ssize_t,      /* dst_offset */
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
+ * into usm_ndarray with elements of type `dstTy` for contiguous arrays.
+ *
+ * Function to cast and copy elements from numpy.ndarray specified by typeless
+ * `host_src_p` and the `src_offset` given in the number of array elements.
+ * Kernel dependencies are given by the vector of events `depends`. The
+ * function execution is complete at the return.
+ *
+ * @param q The queue where the routine should be executed.
+ * @param nelems Number of elements to cast and copy.
+ * @param host_src_p Host (not USM allocated) pointer associated with the
+ * source array.
+ * @param src_offset Offset to the beginning of iteration in number of elements
+ * of the source array from `host_src_p`.
+ * @param dst_p USM pointer associated with the destination array.
+ * @param dst_offset Offset to the beginning of iteration in number of elements
+ * of the destination array from `dst_p`.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
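+ *
+ * The copy is blocking by design: the `sycl::buffer` wrapping the host
+ * pointer must outlive the kernel, so the implementation waits on the kernel
+ * event before returning.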
+ *
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+void copy_and_cast_from_host_contig_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    const char *host_src_p,
+    ssize_t src_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::buffer<srcTy, 1> npy_buf(
+        reinterpret_cast<const srcTy *>(host_src_p) + src_offset,
+        sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}});
+
+    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
+
+        using IndexerT = TwoOffsets_CombinedIndexer<NoOpIndexer, NoOpIndexer>;
+        static constexpr NoOpIndexer src_indexer{};
+        static constexpr NoOpIndexer dst_indexer{};
+        static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer,
+                                                            dst_indexer};
+
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p) + dst_offset;
+
+        cgh.parallel_for<
+            copy_cast_from_host_contig_kernel<srcTy, dstTy>>(
+            sycl::range<1>(nelems),
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>, IndexerT>(
+                npy_acc, dst_tp, indexer));
+    });
+
+    // perform explicit synchronization. Implicit synchronization would be
+    // performed by sycl::buffer destructor.
+    copy_and_cast_from_host_ev.wait();
+
+    return;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given NumPy array
+ * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastFromHostContigFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_from_host_contig_impl<D, S>;
+        return f;
+    }
+};
+
+// =============== Copying for reshape ================== //
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_reshape_generic_kernel;
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class GenericCopyForReshapeFunctor
+{
+private:
+    const Ty *src_p = nullptr;
+    Ty *dst_p = nullptr;
+    SrcIndexerT src_indexer_;
+    DstIndexerT dst_indexer_;
+
+public:
+    GenericCopyForReshapeFunctor(const char *src_ptr,
+                                 char *dst_ptr,
+                                 const SrcIndexerT &src_indexer,
+                                 const DstIndexerT &dst_indexer)
+        : src_p(reinterpret_cast<const Ty *>(src_ptr)),
+          dst_p(reinterpret_cast<Ty *>(dst_ptr)), src_indexer_(src_indexer),
+          dst_indexer_(dst_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const ssize_t src_offset = src_indexer_(wiid.get(0));
+        const ssize_t dst_offset = dst_indexer_(wiid.get(0));
+
+        dst_p[dst_offset] = src_p[src_offset];
+    }
+};
+
+// define function type
+typedef sycl::event (*copy_for_reshape_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // num_elements
+    int,             // src_nd
+    int,             // dst_nd
+    const ssize_t *, // packed shapes and strides
+    const char *,    // src_data_ptr
+    char *,          // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy content of array while reshaping.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index(i,
+ * dst.shape)] = src[unravel_index(i, src.shape)]`.
+ *
+ * @param q      The execution queue where kernel is submitted.
+ * @param nelems The number of elements to copy
+ * @param src_nd Array dimension of the source array
+ * @param dst_nd Array dimension of the destination array
+ * @param packed_shapes_and_strides Kernel accessible USM array of size
+ * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape,
+ * dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param depends List of events to wait for before starting computations, if
+ * any.
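+ *
+ * E.g. (editor's illustration) reshaping a C-contiguous `(6,)` array into
+ * `(2, 3)` packs `[6, 1, 2, 3, 3, 1]`, i.e. `[src_shape, src_strides,
+ * dst_shape, dst_strides]`.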
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event
+copy_for_reshape_generic_impl(sycl::queue &q,
+                              std::size_t nelems,
+                              int src_nd,
+                              int dst_nd,
+                              const ssize_t *packed_shapes_and_strides,
+                              const char *src_p,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        // packed_shapes_and_strides:
+        //   USM array of size 2*(src_nd + dst_nd)
+        //   [ src_shape; src_strides; dst_shape; dst_strides ]
+
+        const ssize_t *src_shape_and_strides =
+            const_cast<const ssize_t *>(packed_shapes_and_strides);
+
+        const ssize_t *dst_shape_and_strides = const_cast<const ssize_t *>(
+            packed_shapes_and_strides + (2 * src_nd));
+
+        const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides};
+        const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides};
+
+        using KernelName =
+            copy_for_reshape_generic_kernel<Ty, StridedIndexer,
+                                            StridedIndexer>;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            GenericCopyForReshapeFunctor<Ty, StridedIndexer, StridedIndexer>(
+                src_p, dst_p, src_indexer, dst_indexer));
+    });
+
+    return copy_for_reshape_ev;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given array data
+ * type `Ty`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename Ty>
+struct CopyForReshapeGenericFactory
+{
+    fnT get()
+    {
+        fnT f = copy_for_reshape_generic_impl<Ty>;
+        return f;
+    }
+};
+
+// ================== Copying for roll ================== //
+
+/*! @brief Functor to cyclically roll global_id to the left */
+struct LeftRolled1DTransformer
+{
+    LeftRolled1DTransformer(std::size_t offset, std::size_t size)
+        : offset_(offset), size_(size)
+    {
+    }
+
+    std::size_t operator()(std::size_t gid) const
+    {
+        const std::size_t shifted_gid =
+            ((gid < offset_) ? gid + size_ - offset_ : gid - offset_);
+        return shifted_gid;
+    }
+
+private:
+    std::size_t offset_ = 0;
+    std::size_t size_ = 1;
+};
+
+/*! @brief Indexer functor to compose indexer and transformer */
+template <typename IndexerT, typename TransformerT>
+struct CompositionIndexer
+{
+    CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {}
+
+    auto operator()(std::size_t gid) const { return f_(t_(gid)); }
+
+private:
+    IndexerT f_;
+    TransformerT t_;
+};
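+
+// Editor's note: composing NoOpIndexer with LeftRolled1DTransformer makes
+// work-item `gid` read from position `(gid - offset) mod size`, e.g.:
+//
+//     const LeftRolled1DTransformer roll{/*offset*/ 2, /*size*/ 5};
+//     const CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>
+//         rolled{NoOpIndexer{}, roll};
+//     // rolled(0) == 3, rolled(1) == 4, rolled(2) == 0, ...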
+
+/*! @brief Indexer functor to find offset for nd-shifted indices lifted from
+ * iteration id */
+struct RolledNDIndexer
+{
+    RolledNDIndexer(int nd,
+                    const ssize_t *shape,
+                    const ssize_t *strides,
+                    const ssize_t *ndshifts,
+                    ssize_t starting_offset)
+        : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts),
+          starting_offset_(starting_offset)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const { return compute_offset(gid); }
+
+private:
+    int nd_ = -1;
+    const ssize_t *shape_ = nullptr;
+    const ssize_t *strides_ = nullptr;
+    const ssize_t *ndshifts_ = nullptr;
+    ssize_t starting_offset_ = 0;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd_);
+        ssize_t relative_offset_(0);
+        _ind.get_left_rolled_displacement(
+            gid,
+            shape_,    // shape ptr
+            strides_,  // strides ptr
+            ndshifts_, // shifts ptr
+            relative_offset_);
+        return starting_offset_ + relative_offset_;
+    }
+};
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_roll_strided_kernel;
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class StridedCopyForRollFunctor
+{
+private:
+    const Ty *src_p = nullptr;
+    Ty *dst_p = nullptr;
+    SrcIndexerT src_indexer_;
+    DstIndexerT dst_indexer_;
+
+public:
+    StridedCopyForRollFunctor(const Ty *src_ptr,
+                              Ty *dst_ptr,
+                              const SrcIndexerT &src_indexer,
+                              const DstIndexerT &dst_indexer)
+        : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer),
+          dst_indexer_(dst_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const std::size_t gid = wiid.get(0);
+
+        const ssize_t src_offset = src_indexer_(gid);
+        const ssize_t dst_offset = dst_indexer_(gid);
+
+        dst_p[dst_offset] = src_p[src_offset];
+    }
+};
+
+// define function type
+typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // shift
+    std::size_t,     // num_elements
+    int,             // common_nd
+    const ssize_t *, // packed shapes and strides
+    const char *,    // src_data_ptr
+    ssize_t,         // src_offset
+    char *,          // dst_data_ptr
+    ssize_t,         // dst_offset
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy content of array with a shift.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems,
+ * dst.shape)] = src[unravel_index(i, src.shape)]`.
+ *
+ * @param q      The execution queue where kernel is submitted.
+ * @param shift  The shift in flat indexing, must be non-negative.
+ * @param nelems The number of elements to copy
+ * @param nd     Array dimensionality of the destination and source arrays
+ * @param packed_shapes_and_strides Kernel accessible USM array
+ * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param src_offset Displacement of first element of src relative src_p in
+ * elements
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param dst_offset Displacement of first element of dst relative dst_p in
+ * elements
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
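+ *
+ * For non-negative `shift` this realizes `dst[(i + shift) % nelems] = src[i]`
+ * in flat index space, matching `numpy.roll(x, shift)` semantics.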
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event copy_for_roll_strided_impl(sycl::queue &q,
+                                       std::size_t shift,
+                                       std::size_t nelems,
+                                       int nd,
+                                       const ssize_t *packed_shapes_and_strides,
+                                       const char *src_p,
+                                       ssize_t src_offset,
+                                       char *dst_p,
+                                       ssize_t dst_offset,
+                                       const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        // packed_shapes_and_strides:
+        //   USM array of size 3 * nd
+        //   [ common_shape; src_strides; dst_strides ]
+
+        const StridedIndexer src_indexer{nd, src_offset,
+                                         packed_shapes_and_strides};
+        const LeftRolled1DTransformer left_roll_transformer{shift, nelems};
+
+        using CompositeIndexerT =
+            CompositionIndexer<StridedIndexer, LeftRolled1DTransformer>;
+
+        const CompositeIndexerT rolled_src_indexer(src_indexer,
+                                                   left_roll_transformer);
+
+        UnpackedStridedIndexer dst_indexer{nd, dst_offset,
+                                           packed_shapes_and_strides,
+                                           packed_shapes_and_strides + 2 * nd};
+
+        using KernelName =
+            copy_for_roll_strided_kernel<Ty, CompositeIndexerT,
+                                         UnpackedStridedIndexer>;
+
+        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p);
+        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p);
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            StridedCopyForRollFunctor<Ty, CompositeIndexerT,
+                                      UnpackedStridedIndexer>(
+                src_tp, dst_tp, rolled_src_indexer, dst_indexer));
+    });
+
+    return copy_for_roll_ev;
+}
+
+// define function type
+typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,  // shift
+    std::size_t,  // num_elements
+    const char *, // src_data_ptr
+    ssize_t,      // src_offset
+    char *,       // dst_data_ptr
+    ssize_t,      // dst_offset
+    const std::vector<sycl::event> &);
+
+template <typename Ty>
+class copy_for_roll_contig_kernel;
+
+/*!
+ * @brief Function to copy content of array with a shift.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems,
+ * dst.shape)] = src[unravel_index(i, src.shape)]`.
+ *
+ * @param q      The execution queue where kernel is submitted.
+ * @param shift  The shift in flat indexing, must be non-negative.
+ * @param nelems The number of elements to copy
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param src_offset Displacement of the start of array src relative src_p in
+ * elements
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param dst_offset Displacement of the start of array dst relative dst_p in
+ * elements
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event copy_for_roll_contig_impl(sycl::queue &q,
+                                      std::size_t shift,
+                                      std::size_t nelems,
+                                      const char *src_p,
+                                      ssize_t src_offset,
+                                      char *dst_p,
+                                      ssize_t dst_offset,
+                                      const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        static constexpr NoOpIndexer src_indexer{};
+        const LeftRolled1DTransformer roller{shift, nelems};
+
+        const CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>
+            left_rolled_src_indexer{src_indexer, roller};
+        static constexpr NoOpIndexer dst_indexer{};
+
+        using KernelName = copy_for_roll_contig_kernel<Ty>;
+
+        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p) + src_offset;
+        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p) + dst_offset;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            StridedCopyForRollFunctor<
+                Ty, CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>,
+                NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer,
+                             dst_indexer));
+    });
+
+    return copy_for_roll_ev;
+}
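+
+// Editor's sketch of a hypothetical call site (names `q`, `src_cp`, `dst_cp`
+// and the element type are assumptions, for illustration only):
+//
+//     // dst[(i + 2) % n] = src[i] for a contiguous float array of n elements
+//     sycl::event e = copy_for_roll_contig_impl<float>(
+//         q, 2, n, src_cp, 0, dst_cp, 0, {});
+//     e.wait();
+
+/*!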
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, + shifts_ptr, src_offset}; + + const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + +} // namespace dpctl::tensor::kernels::copy_and_cast diff --git a/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 000000000000..a723f6334e7e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,636 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::copy_as_contig +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class CopyAsCContigFunctor +{ +private: + std::size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(std::size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_max_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize) + // gid % sgSize == gid - (gid / sgSize) * sgSize + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + const std::uint16_t elems_per_sg = elems_per_wi * sgSize; + + if (base + elems_per_sg < nelems) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + // it == vec_id * vec_sz, for 0 <= vec_id < n_vecs + const std::size_t block_start_id = base + it * sgSize; + auto dst_multi_ptr = sycl::address_space_cast< + 
sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[block_start_id]); + + const std::size_t elem_id0 = + block_start_id + sg.get_local_id(); + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + const std::size_t elem_id = elem_id0 + k * sgSize; + const ssize_t src_offset = src_indexer(elem_id); + dst_vec[k] = src_p[src_offset]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + const std::size_t k0 = base + lane_id; + for (std::size_t k = k0; k < nelems; k += sgSize) { + const ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, + std::size_t nelems, + const T *src, + T *dst, + const IndexerT &src_indexer, + const std::vector &depends) +{ + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::size_t preferred_lws = 256; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + const std::size_t nelems_per_group = nelems_per_wi * lws; + const std::size_t n_groups = + (nelems + nelems_per_group - 1) / (nelems_per_group); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src, dst, src_indexer)); + }); + return copy_ev; +} + +template +class as_contig_krn; + +template +sycl::event + as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + + using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; + using dpctl::tensor::kernels::alignment_utils::is_aligned; + using dpctl::tensor::kernels::alignment_utils::required_alignment; + + sycl::event copy_ev; + if (is_aligned(dst_p)) { + static constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + else { + static constexpr bool disable_sg_load = false; + using InnerKernelName = + as_contig_krn; + using KernelName = disabled_sg_loadstore_wrapper_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + char *, + const 
std::vector &); + +template +struct AsCContigFactory +{ + fnT get() { return as_c_contiguous_array_generic_impl; } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = + n_lines * private_tile_size * private_tile_size; + + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * private_tile_size); + + static constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. 
Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const std::size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_gr_i1 < n_groups_n1 */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_gr_i0 < n_groups_n0 */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 * src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. 
Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == + // (block_size * block_size / nelems_per_wi) == + // (block_size/private_tile_size)**2 + static constexpr std::uint16_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint16_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint16_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint16_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint16_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint16_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint16_t dst_i0 = src_i1; + const std::uint16_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint16_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint16_t _pr_step_local = lws1 * block_size; + + for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint16_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) { + + // 0 <= local_i0 < local_dim0 + const std::uint16_t loc_i0 = el_id / local_dim1; + // 0 <= local_i1 < local_dim1 + const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint16_t dst_i0 = loc_i0; + const std::uint16_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint16_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() { return as_c_contiguous_1d_batch_of_square_matrices_impl; } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + int 
batch_nd, + const ssize_t *src_batch_shape_strides, + const ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = TwoOffsets_CombinedIndexer; + + static constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of F-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() { return as_c_contiguous_nd_batch_of_square_matrices_impl; } +}; +} // namespace dpctl::tensor::kernels::copy_as_contig diff --git a/dpnp/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpnp/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp new file mode 100644 index 000000000000..4db78e1805e3 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp @@ -0,0 +1,40 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace dpctl::tensor +{ +typedef std::ptrdiff_t ssize_t; +} // namespace dpctl::tensor diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp new file mode 100644 index 000000000000..250ba1d70455 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -0,0 +1,237 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ABS(x) function. 
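+/// For complex inputs the magnitude is delegated to `detail::cabs` from
+/// cabs_impl.hpp (see AbsFunctor below).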
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "cabs_impl.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::abs +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::ssize_t; +using dpctl::tensor::type_utils::is_complex; + +template +struct AbsFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) const + { + + if constexpr (std::is_same_v || + (std::is_integral::value && + std::is_unsigned::value)) { + static_assert(std::is_same_v); + return x; + } + else { + if constexpr (is_complex::value) { + return detail::cabs(x); + } + else if constexpr (std::is_same_v || + std::is_floating_point_v) { + return (sycl::signbit(x) ? -x : x); + } + else { + return sycl::abs(x); + } + } + } +}; + +template +using AbsContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +struct AbsOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct AbsContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class abs_contig_kernel; + +template +sycl::event abs_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AbsHS = hyperparam_detail::AbsContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AbsHS::vec_sz; + static constexpr std::uint8_t n_vec = AbsHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AbsContigFactory +{ + fnT get() + { + if constexpr (!AbsOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = abs_contig_impl; + return fn; + } + } +}; + +template +struct AbsTypeMapFactory +{ + /*! 
@brief get typeid for output type of abs(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AbsOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using AbsStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +class abs_strided_kernel; + +template +sycl::event abs_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AbsOutputType, AbsStridedFunctor, abs_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AbsStridedFactory +{ + fnT get() + { + if constexpr (!AbsOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = abs_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::abs diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp new file mode 100644 index 000000000000..9ceeb0947439 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -0,0 +1,273 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ACOS(x) function. 
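+/// Complex special values (NaN and infinity combinations) are handled
+/// explicitly in AcosFunctor below, broadly following the C99 Annex G
+/// conventions.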
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::acos +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AcosFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + + if (std::isnan(x)) { + /* acos(NaN + I*+-Inf) = NaN + I*-+Inf */ + if (std::isinf(y)) { + return resT{q_nan, -y}; + } + + /* all other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + if (std::isnan(y)) { + /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */ + if (std::isinf(x)) { + return resT{q_nan, -std::numeric_limits::infinity()}; + } + /* acos(0 + I*NaN) = PI/2 + I*NaN with inexact */ + if (x == realT(0)) { + const realT res_re = sycl::atan(realT(1)) * 2; // PI/2 + return resT{res_re, q_nan}; + } + + /* all other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including acos(+-Inf + I*+-Inf) + */ + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + using sycl_complexT = exprm_ns::complex; + sycl_complexT log_in = + exprm_ns::log(exprm_ns::complex(in)); + + const realT wx = log_in.real(); + const realT wy = log_in.imag(); + const realT rx = sycl::fabs(wy); + + realT ry = wx + sycl::log(realT(2)); + return resT{rx, (sycl::signbit(y)) ? 
ry : -ry}; + } + + /* ordinary cases */ + return exprm_ns::acos(exprm_ns::complex(in)); // acos(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::acos(in); + } + } +}; + +template +using AcosContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AcosStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AcosOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct AcosContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class acos_contig_kernel; + +template +sycl::event acos_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AcosHS = hyperparam_detail::AcosContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AcosHS::vec_sz; + static constexpr std::uint8_t n_vec = AcosHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AcosContigFactory +{ + fnT get() + { + if constexpr (!AcosOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = acos_contig_impl; + return fn; + } + } +}; + +template +struct AcosTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::acos(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AcosOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class acos_strided_kernel; + +template +sycl::event + acos_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AcosOutputType, AcosStridedFunctor, acos_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AcosStridedFactory +{ + fnT get() + { + if constexpr (!AcosOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = acos_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::acos diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp new file mode 100644 index 000000000000..e356b37361d8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -0,0 +1,304 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ACOSH(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::acosh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AcoshFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + /* + * acosh(in) = I*acos(in) or -I*acos(in) + * where the sign is chosen so Re(acosh(in)) >= 0. + * So, we first calculate acos(in) and then acosh(in). 
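+             * For example, with in = 2 + 0*I: acos(in) = -1.3170*I, so
+             * I*acos(in) = +1.3170 and -I*acos(in) = -1.3170; the branch
+             * with Re(acosh(in)) >= 0 reproduces the real
+             * acosh(2) = log(2 + sqrt(3)) ~= 1.3170.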
+ */ + const realT x = std::real(in); + const realT y = std::imag(in); + + resT acos_in; + if (std::isnan(x)) { + /* acos(NaN + I*+-Inf) = NaN + I*-+Inf */ + if (std::isinf(y)) { + acos_in = resT{q_nan, -y}; + } + else { + acos_in = resT{q_nan, q_nan}; + } + } + else if (std::isnan(y)) { + /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */ + static constexpr realT inf = + std::numeric_limits::infinity(); + + if (std::isinf(x)) { + acos_in = resT{q_nan, -inf}; + } + /* acos(0 + I*NaN) = Pi/2 + I*NaN with inexact */ + else if (x == realT(0)) { + const realT pi_half = sycl::atan(realT(1)) * 2; + acos_in = resT{pi_half, q_nan}; + } + else { + acos_in = resT{q_nan, q_nan}; + } + } + + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + /* + * For large x or y including acos(+-Inf + I*+-Inf) + */ + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + using sycl_complexT = typename exprm_ns::complex; + const sycl_complexT log_in = exprm_ns::log(sycl_complexT(in)); + const realT wx = log_in.real(); + const realT wy = log_in.imag(); + const realT rx = sycl::fabs(wy); + realT ry = wx + sycl::log(realT(2)); + acos_in = resT{rx, (sycl::signbit(y)) ? ry : -ry}; + } + else { + /* ordinary cases */ + acos_in = + exprm_ns::acos(exprm_ns::complex(in)); // acos(in); + } + + /* Now we calculate acosh(z) */ + const realT rx = std::real(acos_in); + const realT ry = std::imag(acos_in); + + /* acosh(NaN + I*NaN) = NaN + I*NaN */ + if (std::isnan(rx) && std::isnan(ry)) { + return resT{ry, rx}; + } + /* acosh(NaN + I*+-Inf) = +Inf + I*NaN */ + /* acosh(+-Inf + I*NaN) = +Inf + I*NaN */ + if (std::isnan(rx)) { + return resT{sycl::fabs(ry), rx}; + } + /* acosh(0 + I*NaN) = NaN + I*NaN */ + if (std::isnan(ry)) { + return resT{ry, ry}; + } + /* ordinary cases */ + const realT res_im = sycl::copysign(rx, std::imag(in)); + return resT{sycl::fabs(ry), res_im}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::acosh(in); + } + } +}; + +template +using AcoshContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AcoshStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AcoshOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AcoshContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class acosh_contig_kernel; + +template +sycl::event acosh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AcoshHS = hyperparam_detail::AcoshContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AcoshHS::vec_sz; + static constexpr std::uint8_t n_vec = AcoshHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, 
depends); +} + +template +struct AcoshContigFactory +{ + fnT get() + { + if constexpr (!AcoshOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = acosh_contig_impl; + return fn; + } + } +}; + +template +struct AcoshTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::acosh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AcoshOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class acosh_strided_kernel; + +template +sycl::event + acosh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AcoshOutputType, AcoshStridedFunctor, acosh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AcoshStridedFactory +{ + fnT get() + { + if constexpr (!AcoshOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = acosh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::acosh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp new file mode 100644 index 000000000000..c7386f99236a --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -0,0 +1,679 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ADD(x1, x2) +/// function. 
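+///
+/// Illustrative host-side sketch (not part of the kernel API): elementwise,
+/// the contiguous kernel below behaves like
+///
+///     std::vector<float> a{1.f, 2.f}, b{3.f, 4.f}, r(2);
+///     std::transform(a.begin(), a.end(), b.begin(), r.begin(),
+///                    [](float x, float y) { return x + y; });
+///     // r == {4.f, 6.f}
+///
+/// with the result type fixed by AddOutputType (e.g. float + double
+/// promotes to double) and the device loop vectorized over sycl::vec
+/// where sub-group load/store is supported.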
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::add +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct AddFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using rT1 = typename argT1::value_type; + using rT2 = typename argT2::value_type; + + return exprm_ns::complex(in1) + exprm_ns::complex(in2); + } + else if constexpr (tu_ns::is_complex::value && + !tu_ns::is_complex::value) { + using rT1 = typename argT1::value_type; + + return exprm_ns::complex(in1) + in2; + } + else if constexpr (!tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using rT2 = typename argT2::value_type; + + return in1 + exprm_ns::complex(in2); + } + else { + return in1 + in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = in1 + in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using AddContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AddStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct AddOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct AddContigHyperparameterSet +{ + using value_type = typename std::disjunction< + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + ContigHyperparameterSetDefault<4u, 2u>>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class add_contig_kernel; + +template +sycl::event add_contig_impl(sycl::queue &exec_q, + 
std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using AddHS = hyperparam_detail::AddContigHyperparameterSet; + static constexpr auto vec_sz = AddHS::vec_sz; + static constexpr auto n_vecs = AddHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template +struct AddContigFactory +{ + fnT get() + { + if constexpr (!AddOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_contig_impl; + return fn; + } + } +}; + +template +struct AddTypeMapFactory +{ + /*! @brief get typeid for output type of std::add(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename AddOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class add_strided_kernel; + +template +sycl::event add_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, AddOutputType, AddStridedFunctor, add_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct AddStridedFactory +{ + fnT get() + { + if constexpr (!AddOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_strided_impl; + return fn; + } + } +}; + +template +class add_matrix_row_broadcast_sg_krn; + +template +using AddContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + AddFunctor>; + +template +sycl::event add_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] + vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor, + add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +} + +template +struct AddContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (!AddOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename AddOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + add_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event add_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] + vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return add_contig_matrix_contig_row_broadcast_impl( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +}; + +template +struct AddContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!AddOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename AddOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + add_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +template +struct AddInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) { res += in; } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res += in; + } +}; + +template +using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + AddInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AddInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + AddInplaceFunctor>; + +template +class add_inplace_contig_kernel; + +/* @brief Types supported by in-place add */ +template +struct AddInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + 
td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct AddInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x += y */ + std::enable_if_t::value, int> get() + { + if constexpr (AddInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + add_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + static constexpr auto vec_sz = + hyperparam_detail::AddContigHyperparameterSet::vec_sz; + static constexpr auto n_vecs = + hyperparam_detail::AddContigHyperparameterSet::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset, + depends); +} + +template +struct AddInplaceContigFactory +{ + fnT get() + { + if constexpr (!AddInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_inplace_contig_impl; + return fn; + } + } +}; + +template +class add_inplace_strided_kernel; + +template +sycl::event + add_inplace_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, AddInplaceStridedFunctor, add_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AddInplaceStridedFactory +{ + fnT get() + { + if constexpr (!AddInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_inplace_strided_impl; + return fn; + } + } +}; + +template +class add_inplace_row_matrix_broadcast_sg_krn; + +template +using AddInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + AddInplaceFunctor>; + +template +sycl::event add_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, AddInplaceRowMatrixBroadcastingFunctor, + add_inplace_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, + vec_p, vec_offset, mat_p, + mat_offset, depends); +} + +template +struct AddInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!AddInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace dpctl::tensor::kernels::add diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp new file mode 100644 index 000000000000..93dbd648e575 --- /dev/null +++ 
b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -0,0 +1,215 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ANGLE(x) function. 
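+///
+/// The functor reduces to the complex argument, which matches std::arg on
+/// the host; a quick illustration of the values it computes:
+///
+///     std::arg(std::complex<double>(1.0, 0.0));   // 0
+///     std::arg(std::complex<double>(-1.0, 0.0));  // pi
+///     std::arg(std::complex<double>(0.0, 2.0));   // pi/2
+///
+/// i.e. for z = x + I*y the result is atan2(y, x), in (-pi, pi].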
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::angle +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AngleFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + using rT = typename argT::value_type; + + return exprm_ns::arg(exprm_ns::complex(in)); // arg(in); + } +}; + +template +using AngleContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AngleStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AngleOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AngleContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class angle_contig_kernel; + +template +sycl::event angle_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AngleHS = hyperparam_detail::AngleContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AngleHS::vec_sz; + static constexpr std::uint8_t n_vec = AngleHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AngleContigFactory +{ + fnT get() + { + if constexpr (!AngleOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = angle_contig_impl; + return fn; + } + } +}; + +template +struct AngleTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::arg(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AngleOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class angle_strided_kernel; + +template +sycl::event + angle_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AngleOutputType, AngleStridedFunctor, angle_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AngleStridedFactory +{ + fnT get() + { + if constexpr (!AngleOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = angle_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::angle diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp new file mode 100644 index 000000000000..d367c1243628 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp @@ -0,0 +1,296 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ASIN(x) function. 
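+///
+/// The complex branch relies on the reflection identity
+/// asin(z) = I * conj(asinh(I * conj(z))), which can be sanity-checked on
+/// the host with std::complex (illustrative sketch):
+///
+///     using C = std::complex<double>;
+///     const C z{0.5, 0.25}, I{0.0, 1.0};
+///     const C lhs = std::asin(z);
+///     const C rhs = I * std::conj(std::asinh(I * std::conj(z)));
+///     // lhs and rhs agree to rounding error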
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::asin +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AsinFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + /* + * asin(in) = I * conj( asinh(I * conj(in)) ) + * so we first calculate w = asinh(I * conj(in)) with + * x = real(I * conj(in)) = imag(in) + * y = imag(I * conj(in)) = real(in) + * and then return {imag(w), real(w)} which is asin(in) + */ + const realT x = std::imag(in); + const realT y = std::real(in); + + if (std::isnan(x)) { + /* asinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */ + if (std::isinf(y)) { + const realT asinh_re = y; + const realT asinh_im = q_nan; + return resT{asinh_im, asinh_re}; + } + /* asinh(NaN + I*0) = NaN + I*0 */ + if (y == realT(0)) { + const realT asinh_re = q_nan; + const realT asinh_im = y; + return resT{asinh_im, asinh_re}; + } + /* All other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + else if (std::isnan(y)) { + /* asinh(+-Inf + I*NaN) = +-Inf + I*NaN */ + if (std::isinf(x)) { + const realT asinh_re = x; + const realT asinh_im = q_nan; + return resT{asinh_im, asinh_re}; + } + /* All other cases involving NaN return NaN + I*NaN. 
*/ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including asinh(+-Inf + I*+-Inf) + * asinh(in) = sign(x)*log(sign(x)*in) + O(1/in^2) as in -> + * infinity The above formula works for the imaginary part as well, + * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) + + * O(y/in^3) as in -> infinity, uniformly in y + */ + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + using sycl_complexT = exprm_ns::complex; + const sycl_complexT z{x, y}; + realT wx, wy; + if (!sycl::signbit(x)) { + const auto log_z = exprm_ns::log(z); + wx = log_z.real() + sycl::log(realT(2)); + wy = log_z.imag(); + } + else { + const auto log_mz = exprm_ns::log(-z); + wx = log_mz.real() + sycl::log(realT(2)); + wy = log_mz.imag(); + } + const realT asinh_re = sycl::copysign(wx, x); + const realT asinh_im = sycl::copysign(wy, y); + return resT{asinh_im, asinh_re}; + } + /* ordinary cases */ + return exprm_ns::asin( + exprm_ns::complex(in)); // sycl::asin(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::asin(in); + } + } +}; + +template +using AsinContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AsinStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AsinOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AsinContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class asin_contig_kernel; + +template +sycl::event asin_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AddHS = hyperparam_detail::AsinContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AddHS::vec_sz; + static constexpr std::uint8_t n_vec = AddHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AsinContigFactory +{ + fnT get() + { + if constexpr (!AsinOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = asin_contig_impl; + return fn; + } + } +}; + +template +struct AsinTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::asin(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AsinOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class asin_strided_kernel; + +template +sycl::event + asin_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AsinOutputType, AsinStridedFunctor, asin_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AsinStridedFactory +{ + fnT get() + { + if constexpr (!AsinOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = asin_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::asin diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp new file mode 100644 index 000000000000..472e04f7cbe8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -0,0 +1,279 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ASINH(x) function. 
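+///
+/// For |Re(z)| or |Im(z)| above 1/epsilon the kernel switches to the
+/// asymptotic form asinh(z) ~ sign(Re z) * log(2 * sign(Re z) * z);
+/// on the host (illustrative):
+///
+///     std::asinh(1e20);                // ~46.7448
+///     std::log(2.0) + std::log(1e20);  // same value, log(2 * 1e20)
+///
+/// the O(1/z^2) correction term is far below double precision there.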
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::asinh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AsinhFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + + if (std::isnan(x)) { + /* asinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */ + if (std::isinf(y)) { + return resT{y, q_nan}; + } + /* asinh(NaN + I*0) = NaN + I*0 */ + if (y == realT(0)) { + return resT{q_nan, y}; + } + /* All other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + + if (std::isnan(y)) { + /* asinh(+-Inf + I*NaN) = +-Inf + I*NaN */ + if (std::isinf(x)) { + return resT{x, q_nan}; + } + /* All other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including asinh(+-Inf + I*+-Inf) + * asinh(in) = sign(x)*log(sign(x)*in) + O(1/in^2) as in -> + * infinity The above formula works for the imaginary part as well, + * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) + + * O(y/in^3) as in -> infinity, uniformly in y + */ + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + using sycl_complexT = exprm_ns::complex; + sycl_complexT log_in = (sycl::signbit(x)) + ? 
exprm_ns::log(sycl_complexT(-in)) + : exprm_ns::log(sycl_complexT(in)); + realT wx = log_in.real() + sycl::log(realT(2)); + realT wy = log_in.imag(); + + const realT res_re = sycl::copysign(wx, x); + const realT res_im = sycl::copysign(wy, y); + return resT{res_re, res_im}; + } + + /* ordinary cases */ + return exprm_ns::asinh(exprm_ns::complex(in)); // asinh(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::asinh(in); + } + } +}; + +template +using AsinhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AsinhStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AsinhOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AsinhContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class asinh_contig_kernel; + +template +sycl::event asinh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AsinhHS = hyperparam_detail::AsinhContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AsinhHS::vec_sz; + static constexpr std::uint8_t n_vec = AsinhHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AsinhContigFactory +{ + fnT get() + { + if constexpr (!AsinhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = asinh_contig_impl; + return fn; + } + } +}; + +template +struct AsinhTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::asinh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AsinhOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class asinh_strided_kernel; + +template +sycl::event + asinh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AsinhOutputType, AsinhStridedFunctor, asinh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AsinhStridedFactory +{ + fnT get() + { + if constexpr (!AsinhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = asinh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::asinh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp new file mode 100644 index 000000000000..ab07a3fce3e0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -0,0 +1,288 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ATAN(x) function. 
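+///
+/// The complex branch uses the same reflection trick as asin, namely
+/// atan(z) = I * conj(atanh(I * conj(z))); a host-side check with
+/// std::complex (illustrative):
+///
+///     using C = std::complex<double>;
+///     const C z{0.3, -0.2}, I{0.0, 1.0};
+///     const C lhs = std::atan(z);
+///     const C rhs = I * std::conj(std::atanh(I * std::conj(z)));
+///     // lhs == rhs up to rounding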
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::atan +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::vec_size_utils::ContigHyperparameterSetDefault; +using dpctl::tensor::kernels::vec_size_utils::UnaryContigHyperparameterSetEntry; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AtanFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + /* + * atan(in) = I * conj( atanh(I * conj(in)) ) + * so we first calculate w = atanh(I * conj(in)) with + * x = real(I * conj(in)) = imag(in) + * y = imag(I * conj(in)) = real(in) + * and then return {imag(w), real(w)} which is atan(in) + */ + const realT x = std::imag(in); + const realT y = std::real(in); + if (std::isnan(x)) { + /* atanh(NaN + I*+-Inf) = sign(NaN)*0 + I*+-Pi/2 */ + if (std::isinf(y)) { + const realT pi_half = sycl::atan(realT(1)) * 2; + + const realT atanh_re = sycl::copysign(realT(0), x); + const realT atanh_im = sycl::copysign(pi_half, y); + return resT{atanh_im, atanh_re}; + } + /* + * All other cases involving NaN return NaN + I*NaN. + */ + return resT{q_nan, q_nan}; + } + else if (std::isnan(y)) { + /* atanh(+-Inf + I*NaN) = +-0 + I*NaN */ + if (std::isinf(x)) { + const realT atanh_re = sycl::copysign(realT(0), x); + const realT atanh_im = q_nan; + return resT{atanh_im, atanh_re}; + } + /* atanh(+-0 + I*NaN) = +-0 + I*NaN */ + if (x == realT(0)) { + return resT{q_nan, x}; + } + /* + * All other cases involving NaN return NaN + I*NaN. + */ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including + * atanh(+-Inf + I*+-Inf) = 0 + I*+-PI/2 + * The sign of pi/2 depends on the sign of imaginary part of the + * input. 
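+             * For example, atan(1e30 + 0*I) evaluates to ~pi/2 and
+             * atan(-1e30 + 0*I) to ~-pi/2, matching the limits of the
+             * real atan at +-infinity.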
+ */ + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + const realT pi_half = sycl::atan(realT(1)) * 2; + + const realT atanh_re = realT(0); + const realT atanh_im = sycl::copysign(pi_half, y); + return resT{atanh_im, atanh_re}; + } + /* ordinary cases */ + return exprm_ns::atan(exprm_ns::complex(in)); // atan(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::atan(in); + } + } +}; + +template +using AtanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AtanStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AtanOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AtanContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class atan_contig_kernel; + +template +sycl::event atan_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AtanHS = hyperparam_detail::AtanContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AtanHS::vec_sz; + static constexpr std::uint8_t n_vec = AtanHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AtanContigFactory +{ + fnT get() + { + if constexpr (!AtanOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atan_contig_impl; + return fn; + } + } +}; + +template +struct AtanTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::atan(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AtanOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class atan_strided_kernel; + +template +sycl::event + atan_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AtanOutputType, AtanStridedFunctor, atan_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AtanStridedFactory +{ + fnT get() + { + if constexpr (!AtanOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atan_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::atan diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp new file mode 100644 index 000000000000..220722d5b596 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -0,0 +1,233 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ATAN2(x1, x2) +/// function. 
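+///
+/// The functor pins down the result for a finite first argument against
+/// +infinity as a signed zero, mirroring the IEEE behaviour of the host
+/// atan2 (illustrative):
+///
+///     const double inf = std::numeric_limits<double>::infinity();
+///     std::atan2(3.0, inf);    // +0.0
+///     std::atan2(-3.0, inf);   // -0.0
+///
+/// i.e. sycl::copysign(0.0, y) for any finite y.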
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::atan2 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct Atan2Functor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::false_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if (std::isinf(in2) && !sycl::signbit(in2)) { + if (std::isfinite(in1)) { + return sycl::copysign(resT(0), in1); + } + } + return sycl::atan2(in1, in2); + } +}; + +template +using Atan2ContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Atan2StridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct Atan2OutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct Atan2ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class atan2_contig_kernel; + +template +sycl::event atan2_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using Atan2HS = + hyperparam_detail::Atan2ContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Atan2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Atan2HS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor, + atan2_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct Atan2ContigFactory +{ + fnT get() + { + if constexpr (!Atan2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atan2_contig_impl; + return fn; + } + } +}; + +template +struct Atan2TypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::atan2(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename Atan2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class atan2_strided_kernel; + +template +sycl::event + atan2_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, Atan2OutputType, Atan2StridedFunctor, + atan2_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Atan2StridedFactory +{ + fnT get() + { + if constexpr (!Atan2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atan2_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::atan2 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp new file mode 100644 index 000000000000..32f5384f4ad8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -0,0 +1,280 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ATANH(x) function. 
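The AtanhFunctor that follows implements the C99 Annex G special values for complex inputs by hand (NaN/Inf combinations, plus a large-magnitude path that saturates the imaginary part to +-pi/2). A small host-side check of two of those rules, assuming a conforming std::atanh for std::complex (illustrative only, not part of this change):

#include <cassert>
#include <cmath>
#include <complex>
#include <limits>

int main()
{
    using cd = std::complex<double>;
    const double inf = std::numeric_limits<double>::infinity();
    const double nan = std::numeric_limits<double>::quiet_NaN();

    // atanh(+Inf + I*NaN) = +0 + I*NaN
    cd r1 = std::atanh(cd(inf, nan));
    assert(r1.real() == 0.0 && std::isnan(r1.imag()));

    // very large |x| or |y|: real part ~ 0, imaginary part -> +-pi/2
    cd r2 = std::atanh(cd(1e300, 1.0));
    assert(std::abs(r2.imag() - std::acos(-1.0) / 2.0) < 1e-12);
    return 0;
}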
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::atanh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AtanhFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + + if (std::isnan(x)) { + /* atanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */ + if (std::isinf(y)) { + const realT pi_half = sycl::atan(realT(1)) * 2; + + const realT res_re = sycl::copysign(realT(0), x); + const realT res_im = sycl::copysign(pi_half, y); + return resT{res_re, res_im}; + } + /* + * All other cases involving NaN return NaN + I*NaN. + */ + return resT{q_nan, q_nan}; + } + else if (std::isnan(y)) { + /* atanh(+-Inf + I*NaN) = +-0 + I*NaN */ + if (std::isinf(x)) { + const realT res_re = sycl::copysign(realT(0), x); + return resT{res_re, q_nan}; + } + /* atanh(+-0 + I*NaN) = +-0 + I*NaN */ + if (x == realT(0)) { + return resT{x, q_nan}; + } + /* + * All other cases involving NaN return NaN + I*NaN. + */ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including + * atanh(+-Inf + I*+-Inf) = 0 + I*+-PI/2 + * The sign of PI/2 depends on the sign of imaginary part of the + * input. 
+ */ + const realT RECIP_EPSILON = + realT(1) / std::numeric_limits::epsilon(); + if (sycl::fabs(x) > RECIP_EPSILON || + sycl::fabs(y) > RECIP_EPSILON) { + const realT pi_half = sycl::atan(realT(1)) * 2; + + const realT res_re = realT(0); + const realT res_im = sycl::copysign(pi_half, y); + return resT{res_re, res_im}; + } + /* ordinary cases */ + return exprm_ns::atanh(exprm_ns::complex(in)); // atanh(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::atanh(in); + } + } +}; + +template +using AtanhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AtanhStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AtanhOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AtanhContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class atanh_contig_kernel; + +template +sycl::event atanh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AtanhHS = hyperparam_detail::AtanhContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AtanhHS::vec_sz; + static constexpr std::uint8_t n_vec = AtanhHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AtanhContigFactory +{ + fnT get() + { + if constexpr (!AtanhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atanh_contig_impl; + return fn; + } + } +}; + +template +struct AtanhTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::atanh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AtanhOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class atanh_strided_kernel; + +template +sycl::event + atanh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AtanhOutputType, AtanhStridedFunctor, atanh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AtanhStridedFactory +{ + fnT get() + { + if constexpr (!AtanhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atanh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::atanh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp new file mode 100644 index 000000000000..dae2e62a76b2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -0,0 +1,461 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_and(ar1, ar2) operation. 
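As with the other bitwise kernels in this change, the BitwiseAndFunctor below special-cases bool: the array API treats "bitwise" operations on bool as logical operations, so the kernel computes in1 && in2 rather than operating on stored byte patterns. A minimal host-side model of the scalar branch (illustrative only, not part of this change):

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T>
T bitwise_and(T a, T b)
{
    if constexpr (std::is_same_v<T, bool>) {
        return a && b; // logical AND for bool
    }
    else {
        return static_cast<T>(a & b); // ordinary bitwise AND
    }
}

int main()
{
    assert(bitwise_and(true, false) == false);
    assert(bitwise_and<std::uint8_t>(0b1100, 0b1010) == 0b1000);
    return 0;
}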
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::bitwise_and +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseAndFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_same_v) { + return in1 && in2; + } + else { + return (in1 & in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (in1 && in2); + return vec_cast( + tmp); + } + else { + return (in1 & in2); + } + } +}; + +template +using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseAndFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseAndStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseAndFunctor>; + +template +struct BitwiseAndOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseAndContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; +} // end of namespace hyperparam_detail + +template +class bitwise_and_contig_kernel; + +template +sycl::event + bitwise_and_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseAndHS = + hyperparam_detail::BitwiseAndContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + static constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor, + bitwise_and_contig_kernel, vec_sz, n_vec>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct BitwiseAndContigFactory +{ + fnT get() + { + if constexpr (!BitwiseAndOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseAndTypeMapFactory +{ + /*! 
@brief get typeid for output type of x & y + */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseAndOutputType<T1, T2>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_and_strided_kernel; + +template +sycl::event + bitwise_and_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseAndOutputType, BitwiseAndStridedFunctor, + bitwise_and_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseAndStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseAndOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseAndInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + if constexpr (std::is_same_v) { + res = res && in; + } + else { + res &= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res && in); + res = vec_cast( + tmp); + } + else { + res &= in; + } + } +}; + +template +using BitwiseAndInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseAndInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseAndInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseAndInplaceFunctor>; + +template +class bitwise_and_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise AND */ +template +struct BitwiseAndInplaceTypePairSupport +{ + /* value if true a kernel for <argTy, resTy> must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseAndInplaceTypeMapFactory +{ + /*!
@brief get typeid for output type of x &= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseAndInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event bitwise_and_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseAndHS = + hyperparam_detail::BitwiseAndContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseAndInplaceContigFunctor, + bitwise_and_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseAndInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseAndInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_and_inplace_strided_kernel; + +template +sycl::event bitwise_and_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseAndInplaceStridedFunctor, + bitwise_and_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseAndInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseAndInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_and diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp new file mode 100644 index 000000000000..96da6b9627ab --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -0,0 +1,231 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
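The in-place sections of these headers gate kernel instantiation twice: the TypePairSupport structs whitelist (argT, resT) pairs via std::disjunction, and the factories return nullptr for anything outside the list, leaving that dispatch-table slot empty. A standard-C++ model of the trait chain, where PairEntry and NotDefined are illustrative stand-ins for dpctl's TypePairDefinedEntry and NotDefinedEntry (a sketch, not the real traits):

#include <cstdint>
#include <type_traits>

template <typename argT, typename A, typename resT, typename R>
struct PairEntry : std::bool_constant<std::is_same_v<argT, A> &&
                                      std::is_same_v<resT, R>>
{
};

struct NotDefined : std::false_type
{
};

// std::disjunction stops at the first matching entry and otherwise
// falls through to NotDefined, mirroring the structs in these headers.
template <typename argT, typename resT>
inline constexpr bool inplace_and_defined =
    std::disjunction<PairEntry<argT, bool, resT, bool>,
                     PairEntry<argT, std::int32_t, resT, std::int32_t>,
                     NotDefined>::value;

static_assert(inplace_and_defined<bool, bool>);
static_assert(!inplace_and_defined<float, float>); // no bitwise AND on float

int main() { return 0; }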
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of bitwise_invert(x) +/// function that inverts bits of binary representation of the argument. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::bitwise_invert +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseInvertFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_integral_v || std::is_same_v); + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::negation>; + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const + { + if constexpr (std::is_same_v) { + return !in; + } + else { + return ~in; + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + return ~in; + } +}; + +template +using BitwiseInvertContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseInvertStridedFunctor = + elementwise_common::UnaryStridedFunctor>; + +template +struct BitwiseInvertOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct BitwiseInvertContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class bitwise_invert_contig_kernel; + +template +sycl::event + bitwise_invert_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using BitwiseInvertHS = + hyperparam_detail::BitwiseInvertContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz; + static constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, 
BitwiseInvertOutputType, BitwiseInvertContigFunctor, + bitwise_invert_contig_kernel, vec_sz, n_vec>(exec_q, nelems, arg_p, + res_p, depends); +} + +template +struct BitwiseInvertContigFactory +{ + fnT get() + { + if constexpr (!BitwiseInvertOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_invert_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseInvertTypeMapFactory +{ + /*! @brief get typeid for output type of ~x */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseInvertOutputType<T>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_invert_strided_kernel; + +template +sycl::event bitwise_invert_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, BitwiseInvertOutputType, BitwiseInvertStridedFunctor, + bitwise_invert_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct BitwiseInvertStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseInvertOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_invert_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_invert diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp new file mode 100644 index 000000000000..59279a803ed8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -0,0 +1,481 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED.
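The BitwiseInvertFunctor in the preceding bitwise_invert header follows the same bool convention: logical negation for bool, one's complement for integers. A host-side sketch of the scalar branch (illustrative only, not part of this change):

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T>
T invert(T x)
{
    if constexpr (std::is_same_v<T, bool>) {
        return !x; // array API: bool inversion is logical NOT
    }
    else {
        return static_cast<T>(~x); // one's complement for integers
    }
}

int main()
{
    assert(invert(true) == false);
    assert(invert<std::uint8_t>(0b00001111) == 0b11110000);
    assert(invert<std::int8_t>(0) == -1); // ~0 sets every bit
    return 0;
}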
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_left_shift(ar1, ar2) +/// operation. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::bitwise_left_shift +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseLeftShiftFunctor +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return impl(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + res[i] = impl(in1[i], in2[i]); + } + return res; + } + +private: + resT impl(const argT1 &in1, const argT2 &in2) const + { + static constexpr argT2 in1_bitsize = + static_cast(sizeof(argT1) * 8); + static constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + return (in2 < in1_bitsize) ? (in1 << in2) : zero; + } + else { + return (in2 < argT2(0)) + ? zero + : ((in2 < in1_bitsize) ? 
(in1 << in2) : zero); + } + } +}; + +template +using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseLeftShiftFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseLeftShiftStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseLeftShiftFunctor>; + +template +struct BitwiseLeftShiftOutputType +{ + using ResT = T1; + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseLeftShiftContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class bitwise_left_shift_contig_kernel; + +template +sycl::event + bitwise_left_shift_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseLSHS = + hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseLeftShiftOutputType, + BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct BitwiseLeftShiftContigFactory +{ + fnT get() + { + if constexpr (!BitwiseLeftShiftOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseLeftShiftTypeMapFactory +{ + /*! 
@brief get typeid for output type of x << y + */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseLeftShiftOutputType<T1, T2>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_left_shift_strided_kernel; + +template +sycl::event bitwise_left_shift_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseLeftShiftOutputType, + BitwiseLeftShiftStridedFunctor, bitwise_left_shift_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseLeftShiftStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseLeftShiftOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseLeftShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const { impl(res, in); } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) const + { + static constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + static constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res <<= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res <<= in) : res = zero); + } + } +}; + +template +using BitwiseLeftShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseLeftShiftInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseLeftShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseLeftShiftInplaceFunctor>; + +template +class bitwise_left_shift_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise left shift */ +template +struct BitwiseLeftShiftInplaceTypePairSupport +{ + /* value if true a kernel for <argTy, resTy> must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseLeftShiftInplaceTypeMapFactory +{ + /*!
@brief get typeid for output type of x <<= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseLeftShiftInplaceTypePairSupport< + argT, resT>::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event bitwise_left_shift_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseLSHS = + hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseLeftShiftInplaceContigFunctor, + bitwise_left_shift_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseLeftShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseLeftShiftInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_left_shift_inplace_strided_kernel; + +template +sycl::event bitwise_left_shift_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseLeftShiftInplaceStridedFunctor, + bitwise_left_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseLeftShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseLeftShiftInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_left_shift diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp new file mode 100644 index 000000000000..6714f238ffce --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -0,0 +1,461 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
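The shift guard in the preceding bitwise_left_shift header exists because C++ leaves x << s undefined for negative s or s >= bit-width, while the array API requires 0 in both cases. A standalone restatement of that rule (illustrative; the kernel expresses the same logic with nested ternaries):

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T, typename S>
T safe_left_shift(T x, S s)
{
    constexpr S width = static_cast<S>(sizeof(T) * 8);
    if constexpr (std::is_signed_v<S>) {
        if (s < S(0)) {
            return T(0); // negative shift count: mandated 0
        }
    }
    return (s < width) ? static_cast<T>(x << s) : T(0);
}

int main()
{
    assert((safe_left_shift<std::uint8_t>(0b0001, 3)) == 0b1000);
    assert((safe_left_shift<std::uint8_t>(1, 8)) == 0);  // UB if done raw
    assert((safe_left_shift<std::int32_t>(1, -1)) == 0); // UB if done raw
    return 0;
}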
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_or(ar1, ar2) operation. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::bitwise_or +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseOrFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_same_v) { + return in1 || in2; + } + else { + return (in1 | in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (in1 || in2); + return vec_cast( + tmp); + } + else { + return (in1 | in2); + } + } +}; + +template +using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseOrFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseOrStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseOrFunctor>; + +template +struct BitwiseOrOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseOrContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace 
hyperparam_detail + +template +class bitwise_or_contig_kernel; + +template +sycl::event bitwise_or_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseOrHS = + hyperparam_detail::BitwiseOrContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor, + bitwise_or_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct BitwiseOrContigFactory +{ + fnT get() + { + if constexpr (!BitwiseOrOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseOrTypeMapFactory +{ + /*! @brief get typeid for output type of x | y + */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseOrOutputType<T1, T2>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_or_strided_kernel; + +template +sycl::event + bitwise_or_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseOrOutputType, BitwiseOrStridedFunctor, + bitwise_or_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseOrStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseOrOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseOrInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + if constexpr (std::is_same_v) { + res = res || in; + } + else { + res |= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res || in); + res = vec_cast( + tmp); + } + else { + res |= in; + } + } +}; + +template +using BitwiseOrInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseOrInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseOrInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseOrInplaceFunctor>; + +template +class bitwise_or_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise OR */ +template +struct BitwiseOrInplaceTypePairSupport +{ + /* value if true a kernel for <argTy, resTy> must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, +
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseOrInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x |= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseOrInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + bitwise_or_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseOrHS = + hyperparam_detail::BitwiseOrContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseOrInplaceContigFunctor, + bitwise_or_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseOrInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseOrInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_or_inplace_strided_kernel; + +template +sycl::event bitwise_or_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseOrInplaceStridedFunctor, + bitwise_or_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseOrInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseOrInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_or diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp new file mode 100644 index 000000000000..241852b6a06e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -0,0 +1,487 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
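Throughout these headers the *ContigHyperparameterSet structs pick a (vec_sz, n_vecs) pair per type combination; one work-item of a contiguous kernel then processes n_vecs chunks of vec_sz elements. A rough model of the resulting launch arithmetic (the exact rounding and sub-group handling live in the shared common.hpp machinery, which this diff does not show):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main()
{
    const std::size_t nelems = 10000;
    const std::uint8_t vec_sz = 4, n_vecs = 2;

    // each work-item covers vec_sz * n_vecs contiguous elements
    const std::size_t per_wi = std::size_t(vec_sz) * n_vecs;
    const std::size_t n_wi = (nelems + per_wi - 1) / per_wi;

    std::printf("%zu work-items for %zu elements\n", n_wi, nelems);
    return 0;
}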
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_right_shift(ar1, ar2) +/// operation. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::bitwise_right_shift +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseRightShiftFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return impl(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + res[i] = impl(in1[i], in2[i]); + } + return res; + } + +private: + resT impl(const argT1 &in1, const argT2 &in2) const + { + static constexpr argT2 in1_bitsize = + static_cast(sizeof(argT1) * 8); + static constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + return (in2 < in1_bitsize) ? (in1 >> in2) : zero; + } + else { + return (in2 < argT2(0)) + ? zero + : ((in2 < in1_bitsize) + ? (in1 >> in2) + : (in1 < argT1(0) ? 
resT(-1) : zero)); + } + } +}; + +template +using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseRightShiftFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseRightShiftStridedFunctor = + elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseRightShiftFunctor>; + +template +struct BitwiseRightShiftOutputType +{ + using ResT = T1; + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseRightShiftContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class bitwise_right_shift_contig_kernel; + +template +sycl::event bitwise_right_shift_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseRSHS = + hyperparam_detail::BitwiseRightShiftContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseRightShiftOutputType, + BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template +struct BitwiseRightShiftContigFactory +{ + fnT get() + { + if constexpr (!BitwiseRightShiftOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseRightShiftTypeMapFactory +{ + /*! 
@brief get typeid for output type of x >> y + */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseRightShiftOutputType<T1, T2>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_right_shift_strided_kernel; + +template +sycl::event bitwise_right_shift_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseRightShiftOutputType, + BitwiseRightShiftStridedFunctor, bitwise_right_shift_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseRightShiftStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseRightShiftOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseRightShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const { impl(res, in); } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) const + { + static constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + static constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res >>= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res >>= in) + : (res < resT(0)) ? res = resT(-1) + : res = zero); + } + } +}; + +template +using BitwiseRightShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseRightShiftInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseRightShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseRightShiftInplaceFunctor>; + +template +class bitwise_right_shift_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise right shift */ +template +struct BitwiseRightShiftInplaceTypePairSupport +{ + /* value if true a kernel for <argTy, resTy> must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseRightShiftInplaceTypeMapFactory +{ + /*!
@brief get typeid for output type of x >>= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseRightShiftInplaceTypePairSupport< + argT, resT>::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event bitwise_right_shift_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseRSHS = + hyperparam_detail::BitwiseRightShiftContigHyperparameterSet; + + // res = OP(res, arg) + static constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseRightShiftInplaceContigFunctor, + bitwise_right_shift_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseRightShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseRightShiftInplaceTypePairSupport< + T1, T2>::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_right_shift_inplace_strided_kernel; + +template +sycl::event bitwise_right_shift_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseRightShiftInplaceStridedFunctor, + bitwise_right_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseRightShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseRightShiftInplaceTypePairSupport< + T1, T2>::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_right_shift diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp new file mode 100644 index 000000000000..292cf3f76df6 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -0,0 +1,465 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_xor(ar1, ar2) operation. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::bitwise_xor +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseXorFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_same_v) { + // (false != false) -> false, (false != true) -> true + // (true != false) -> true, (true != true) -> false + return (in1 != in2); + } + else { + return (in1 ^ in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (in1 != in2); + return vec_cast( + tmp); + } + else { + return (in1 ^ in2); + } + } +}; + +template +using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseXorFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseXorStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseXorFunctor>; + +template +struct BitwiseXorOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseXorContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto 
vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class bitwise_xor_contig_kernel; + +template +sycl::event + bitwise_xor_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseXorHS = + hyperparam_detail::BitwiseXorContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor, + bitwise_xor_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct BitwiseXorContigFactory +{ + fnT get() + { + if constexpr (!BitwiseXorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseXorTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool + */ + std::enable_if_t::value, int> get() + { + using rT = typename BitwiseXorOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class bitwise_xor_strided_kernel; + +template +sycl::event + bitwise_xor_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseXorOutputType, BitwiseXorStridedFunctor, + bitwise_xor_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseXorStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseXorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseXorInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + if constexpr (std::is_same_v) { + res = (res != in); + } + else { + res ^= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res != in); + res = vec_cast( + tmp); + } + else { + res ^= in; + } + } +}; + +template +using BitwiseXorInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseXorInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseXorInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseXorInplaceFunctor>; + +template +class bitwise_xor_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise XOR */ +template +struct BitwiseXorInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, 
+ td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseXorInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x ^= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseXorInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event bitwise_xor_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseXorHS = + hyperparam_detail::BitwiseXorContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseXorInplaceContigFunctor, + bitwise_xor_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseXorInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseXorInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_xor_inplace_strided_kernel; + +template +sycl::event bitwise_xor_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseXorInplaceStridedFunctor, + bitwise_xor_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseXorInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseXorInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_xor diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp new file mode 100644 index 000000000000..ae632061571f --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp @@ -0,0 +1,77 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines an implementation of the complex absolute value. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "sycl_complex.hpp" + +namespace dpctl::tensor::kernels::detail +{ + +template +realT cabs(std::complex const &z) +{ + // Special values for cabs( x + y * 1j): + // * If x is either +infinity or -infinity and y is any value + // (including NaN), the result is +infinity. + // * If x is any value (including NaN) and y is either +infinity or + // -infinity, the result is +infinity. + // * If x is either +0 or -0, the result is equal to abs(y). + // * If y is either +0 or -0, the result is equal to abs(x). + // * If x is NaN and y is a finite number, the result is NaN. + // * If x is a finite number and y is NaN, the result is NaN. + // * If x is NaN and y is NaN, the result is NaN. + + const realT x = std::real(z); + const realT y = std::imag(z); + + static constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT p_inf = std::numeric_limits::infinity(); + + const realT res = + std::isinf(x) + ? p_inf + : ((std::isinf(y) + ? p_inf + : ((std::isnan(x) + ? q_nan + : exprm_ns::abs(exprm_ns::complex(z)))))); + + return res; +} + +} // namespace dpctl::tensor::kernels::detail diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp new file mode 100644 index 000000000000..20fb0ea7bcda --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -0,0 +1,206 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CBRT(x) +/// function that computes a cube root. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::cbrt +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct CbrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const { return sycl::cbrt(in); } +}; + +template +using CbrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CbrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct CbrtOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct CbrtContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class cbrt_contig_kernel; + +template +sycl::event cbrt_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using CbrtHS = hyperparam_detail::CbrtContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CbrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = CbrtHS::n_vecs; 
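+ // The hyperparameter set above picks the per-type SIMD shape; each
+ // work-item processes vec_sz * n_vecs contiguous elements, so the
+ // helper below sizes the launch at roughly nelems / (vec_sz * n_vecs)
+ // work-items (see unary_contig_impl in common.hpp).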
+ + return elementwise_common::unary_contig_impl< + argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct CbrtContigFactory +{ + fnT get() + { + if constexpr (!CbrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_contig_impl; + return fn; + } + } +}; + +template +struct CbrtTypeMapFactory +{ + /*! @brief get typeid for output type of std::cbrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CbrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class cbrt_strided_kernel; + +template +sycl::event + cbrt_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct CbrtStridedFactory +{ + fnT get() + { + if constexpr (!CbrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::cbrt diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp new file mode 100644 index 000000000000..08fd4da2fb50 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -0,0 +1,230 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CEIL(x) function. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::ceil +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct CeilFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (std::is_integral_v) { + return in; + } + else { + if (in == 0) { + return in; + } + return sycl::ceil(in); + } + } +}; + +template +using CeilContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CeilStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct CeilOutputType +{ + using value_type = + typename std::disjunction, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct CeilContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class ceil_contig_kernel; + +template +sycl::event ceil_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using CeilHS = hyperparam_detail::CeilContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CeilHS::vec_sz; + static constexpr std::uint8_t n_vecs = CeilHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct CeilContigFactory +{ + fnT get() + { + if constexpr (!CeilOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = ceil_contig_impl; + return fn; + } + } +}; + +template +struct CeilTypeMapFactory +{ + /*! 
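+ For integral inputs the functor above is the identity; for floating
+ point it defers to sycl::ceil, with an explicit zero check that
+ passes signed zeros through unchanged (ceil(-0.0) stays -0.0).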
@brief get typeid for output type of sycl::ceil(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CeilOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class ceil_strided_kernel; + +template +sycl::event + ceil_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CeilOutputType, CeilStridedFunctor, ceil_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct CeilStridedFactory +{ + fnT get() + { + if constexpr (!CeilOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = ceil_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::ceil diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp new file mode 100644 index 000000000000..cfe3f4898491 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -0,0 +1,1036 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines common code for elementwise tensor operations. 
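+/// The contiguous functors below dispatch between a constant-value fast
+/// path (unary ops whose result is a known constant), a sycl::vec path
+/// for operators that support vectors, and a scalar loop; sub-group
+/// loads/stores are enabled only when the data pointers meet the
+/// required alignment (see is_aligned / disabled_sg_loadstore_wrapper_krn).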
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "common_detail.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" + +#include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::elementwise_common +{ +using dpctl::tensor::ssize_t; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +/*! @brief Functor for unary function evaluation on contiguous array */ +template +struct UnaryContigFunctor +{ +private: + const argT *in = nullptr; + resT *out = nullptr; + std::size_t nelems_; + +public: + UnaryContigFunctor(const argT *inp, resT *res, const std::size_t n_elems) + : in(inp), out(res), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + UnaryOperatorT op{}; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: work-group size must be divisible by sub-group size */ + + if constexpr (enable_sg_loadstore && + UnaryOperatorT::is_constant::value) { + // value of operator is known to be a known constant + constexpr resT const_val = UnaryOperatorT::constant_value; + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + if (base + elems_per_wi * sgSize < nelems_) { + static constexpr sycl::vec res_vec(const_val); +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = const_val; + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value && + UnaryOperatorT::supports_vec::value && + (vec_sz > 1)) { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec x = + sub_group_load(sg, in_multi_ptr); + const sycl::vec res_vec = op(x); + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + // scalar call + out[k] = 
op(in[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value && + std::is_same_v) { + // default: use scalar-value function + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + sycl::vec arg_vec = + sub_group_load(sg, in_multi_ptr); +#pragma unroll + for (std::uint32_t k = 0; k < vec_sz; ++k) { + arg_vec[k] = op(arg_vec[k]); + } + sub_group_store(sg, arg_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value) { + // default: use scalar-value function + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg_vec = + sub_group_load(sg, in_multi_ptr); + sycl::vec res_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + res_vec[k] = op(arg_vec[k]); + } + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in[k]); + } + } + } + else { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + out[offset] = op(in[offset]); + } + } + } +}; + +template +struct UnaryStridedFunctor +{ +private: + const argT *inp_ = nullptr; + resT *res_ = nullptr; + IndexerT inp_out_indexer_; + +public: + UnaryStridedFunctor(const argT *inp_p, + resT *res_p, + const IndexerT &inp_out_indexer) + : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &offsets_ = inp_out_indexer_(wid.get(0)); + const ssize_t &inp_offset = offsets_.get_first_offset(); + const ssize_t &res_offset = offsets_.get_second_offset(); + + UnaryOpT op{}; + + res_[res_offset] = op(inp_[inp_offset]); + } +}; + +template +SizeT select_lws(const 
sycl::device &, SizeT n_work_items_needed) +{ + // TODO: make the decision based on device descriptors + + // constexpr SizeT few_threshold = (SizeT(1) << 17); + static constexpr SizeT med_threshold = (SizeT(1) << 21); + + const SizeT lws = + (n_work_items_needed <= med_threshold ? SizeT(128) : SizeT(256)); + + return lws; +} + +template class UnaryOutputType, + template class ContigFunctorT, + template class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> +sycl::event unary_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + const std::size_t n_work_items_needed = nelems / elems_per_wi; + const std::size_t lws = + select_lws(exec_q.get_device(), n_work_items_needed); + + const std::size_t n_groups = + ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename UnaryOutputType::value_type; + using BaseKernelName = kernel_name; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + if (is_aligned(arg_p) && + is_aligned(res_p)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = BaseKernelName; + using Impl = ContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = ContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + }); + + return comp_ev; +} + +template class UnaryOutputType, + template class StridedFunctorT, + template class kernel_name> +sycl::event + unary_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename UnaryOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + using Impl = StridedFunctorT; + + cgh.parallel_for>( + {nelems}, Impl(arg_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template +struct BinaryContigFunctor +{ +private: + const argT1 *in1 = nullptr; + const argT2 *in2 = nullptr; + resT *out = nullptr; + std::size_t nelems_; + +public: + BinaryContigFunctor(const argT1 *inp1, + const argT2 *inp2, + resT *res, + const std::size_t n_elems) + : in1(inp1), in2(inp2), out(res), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + BinaryOperatorT op{}; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: work-group size must be divisible by sub-group size */ + + if constexpr (enable_sg_loadstore && + BinaryOperatorT::supports_sg_loadstore::value && + 
BinaryOperatorT::supports_vec::value && (vec_sz > 1)) { + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { + sycl::vec res_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + std::size_t offset = base + it * sgSize; + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[offset]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg1_vec = + sub_group_load(sg, in1_multi_ptr); + const sycl::vec arg2_vec = + sub_group_load(sg, in2_multi_ptr); + res_vec = op(arg1_vec, arg2_vec); + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in1[k], in2[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + BinaryOperatorT::supports_sg_loadstore::value) { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[offset]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg1_vec = + sub_group_load(sg, in1_multi_ptr); + const sycl::vec arg2_vec = + sub_group_load(sg, in2_multi_ptr); + + sycl::vec res_vec; +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + res_vec[vec_id] = + op(arg1_vec[vec_id], arg2_vec[vec_id]); + } + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in1[k], in2[k]); + } + } + } + else { + const std::size_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::size_t elems_per_sg = sgSize * elems_per_wi; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + out[offset] = op(in1[offset], in2[offset]); + } + } + } +}; + +template +struct BinaryStridedFunctor +{ +private: + const argT1 *in1 = nullptr; + const argT2 *in2 = nullptr; + resT *out = nullptr; + ThreeOffsets_IndexerT three_offsets_indexer_; + +public: + BinaryStridedFunctor(const argT1 *inp1_tp, + const argT2 *inp2_tp, + resT *res_tp, + const ThreeOffsets_IndexerT &inps_res_indexer) + : in1(inp1_tp), 
in2(inp2_tp), out(res_tp), + three_offsets_indexer_(inps_res_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &three_offsets_ = + three_offsets_indexer_(static_cast(wid.get(0))); + + const auto &inp1_offset = three_offsets_.get_first_offset(); + const auto &inp2_offset = three_offsets_.get_second_offset(); + const auto &out_offset = three_offsets_.get_third_offset(); + + BinaryOperatorT op{}; + out[out_offset] = op(in1[inp1_offset], in2[inp2_offset]); + } +}; + +template +struct BinaryContigMatrixContigRowBroadcastingFunctor +{ +private: + const argT1 *mat; + const argT2 *padded_vec; + resT *res; + std::size_t n_elems; + std::size_t n1; + +public: + BinaryContigMatrixContigRowBroadcastingFunctor(const argT1 *mat_tp, + const argT2 *row_tp, + resT *res_tp, + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) + : mat(mat_tp), padded_vec(row_tp), res(res_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + /* NOTE: work-group size must be divisible by sub-group size */ + + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + const auto &sg = ndit.get_sub_group(); + const std::size_t gid = ndit.get_global_linear_id(); + + const std::size_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT1 mat_el = sub_group_load(sg, in1_multi_ptr); + const argT2 vec_el = sub_group_load(sg, in2_multi_ptr); + + resT res_el = op(mat_el, vec_el); + + sub_group_store(sg, res_el, out_multi_ptr); + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) { + res[k] = op(mat[k], padded_vec[k % n1]); + } + } + } +}; + +template +struct BinaryContigRowContigMatrixBroadcastingFunctor +{ +private: + const argT1 *padded_vec; + const argT2 *mat; + resT *res; + std::size_t n_elems; + std::size_t n1; + +public: + BinaryContigRowContigMatrixBroadcastingFunctor(const argT1 *row_tp, + const argT2 *mat_tp, + resT *res_tp, + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) + : padded_vec(row_tp), mat(mat_tp), res(res_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + /* NOTE: work-group size must be divisible by sub-group size */ + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + const auto &sg = ndit.get_sub_group(); + std::size_t gid = ndit.get_global_linear_id(); + + const std::size_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + 
sycl::access::decorated::yes>(&res[base]); + + const argT2 mat_el = sub_group_load(sg, in2_multi_ptr); + const argT1 vec_el = sub_group_load(sg, in1_multi_ptr); + + resT res_el = op(vec_el, mat_el); + + sub_group_store(sg, res_el, out_multi_ptr); + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) { + res[k] = op(padded_vec[k % n1], mat[k]); + } + } + } +}; + +// Typedefs for function pointers + +typedef sycl::event (*unary_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +typedef sycl::event (*unary_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +typedef sycl::event (*binary_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +template class BinaryOutputType, + template class BinaryContigFunctorT, + template class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> +sycl::event binary_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + const std::size_t n_work_items_needed = nelems / (n_vecs * vec_sz); + const std::size_t lws = + select_lws(exec_q.get_device(), n_work_items_needed); + + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename BinaryOutputType::value_type; + using BaseKernelName = kernel_name; + + const argTy1 *arg1_tp = + reinterpret_cast(arg1_p) + arg1_offset; + const argTy2 *arg2_tp = + reinterpret_cast(arg2_p) + arg2_offset; + resTy *res_tp = reinterpret_cast(res_p) + res_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + if (is_aligned(arg1_tp) && + is_aligned(arg2_tp) && + is_aligned(res_tp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = BaseKernelName; + using Impl = BinaryContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg1_tp, arg2_tp, res_tp, nelems)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = BinaryContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg1_tp, arg2_tp, res_tp, nelems)); + } + }); + return comp_ev; +} + +template class BinaryOutputType, + template class 
BinaryStridedFunctorT, + template class kernel_name> +sycl::event + binary_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename BinaryOutputType::value_type; + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + + const IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, + shape_and_strides}; + + const argTy1 *arg1_tp = reinterpret_cast(arg1_p); + const argTy2 *arg2_tp = reinterpret_cast(arg2_p); + resTy *res_tp = reinterpret_cast(res_p); + + using Impl = BinaryStridedFunctorT; + + cgh.parallel_for>( + {nelems}, Impl(arg1_tp, arg2_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template < + typename argT1, + typename argT2, + typename resT, + template class BinaryContigMatrixContigRowBroadcastFunctorT, + template class kernel_name> +sycl::event binary_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(mat[i,j], vec[j]) + ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; + const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + std::size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + std::size_t n1_padded = n1 + max_sgSize; + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT2 *padded_vec = padded_vec_owner.get(); + + sycl::event make_padded_vec_ev = + dpctl::tensor::kernels::elementwise_detail::populate_padded_vector< + argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sub_group_load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sub_group_load(&padded_vec[(base / n0)]). 
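+ // Illustrative bounds check (numbers chosen for the example): with
+ // n1 = 5 and a max sub-group size of 8, padded_vec holds 13 elements
+ // (the row repeated, wrapped), so a load of 8 lanes starting at any
+ // base % n1 <= 4 touches index at most 4 + 7 = 11 < 13.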
+ // The vector is padded to ensure that reads are accessible + + const std::size_t lws = 128; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + using Impl = + BinaryContigMatrixContigRowBroadcastFunctorT; + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + Impl(mat, padded_vec, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +template < + typename argT1, + typename argT2, + typename resT, + template class BinaryContigRowContigMatrixBroadcastFunctorT, + template class kernel_name> +sycl::event binary_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(vec[j], mat[i,j]) + ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; + const argT2 *mat = reinterpret_cast(mat_p) + mat_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + std::size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + std::size_t n1_padded = n1 + max_sgSize; + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT2 *padded_vec = padded_vec_owner.get(); + + sycl::event make_padded_vec_ev = + dpctl::tensor::kernels::elementwise_detail::populate_padded_vector< + argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sub_group_load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sub_group_load(&padded_vec[(base / n0)]). 
The vector is + // padded to ensure that reads are accessible + + const std::size_t lws = 128; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + using Impl = + BinaryContigRowContigMatrixBroadcastFunctorT; + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + Impl(padded_vec, mat, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +}; +} // namespace dpctl::tensor::kernels::elementwise_common diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp new file mode 100644 index 000000000000..68d025ec6307 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines common code for elementwise tensor operations. 
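+///
+/// The helper below builds a padded copy of a short row so that sub-group
+/// loads in the row-broadcast kernels may safely read up to one sub-group
+/// width past the logical end of the row. A minimal usage sketch (the
+/// queue, pointers, and sizes are illustrative placeholders, not part of
+/// this patch):
+///
+/// \code
+///   // pad an n1-element row out to n1 + max_sgSize elements
+///   sycl::event ev = elementwise_detail::populate_padded_vector<float>(
+///       q, row_ptr, n1, padded_ptr, n1 + max_sgSize, {});
+/// \endcode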
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::kernels::elementwise_detail
+{
+template <typename T>
+class populate_padded_vec_krn;
+
+template <typename T>
+sycl::event
+    populate_padded_vector(sycl::queue &exec_q,
+                           const T *vec,
+                           std::size_t vec_sz,
+                           T *padded_vec,
+                           std::size_t padded_vec_sz,
+                           const std::vector<sycl::event> &dependent_events)
+{
+    sycl::event populate_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
+        // ensure vec contains actual data
+        cgh.depends_on(dependent_events);
+
+        sycl::range<1> gRange{padded_vec_sz};
+
+        cgh.parallel_for<populate_padded_vec_krn<T>>(
+            gRange, [=](sycl::id<1> id) {
+                std::size_t i = id[0];
+                padded_vec[i] = vec[i % vec_sz];
+            });
+    });
+
+    return populate_padded_vec_ev;
+}
+} // namespace dpctl::tensor::kernels::elementwise_detail
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
new file mode 100644
index 000000000000..61902fce888a
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -0,0 +1,476 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines common code for in-place elementwise tensor operations.
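+///
+/// The functors below are parameterized by an operation type whose call
+/// operator mutates its first argument, i.e. op(lhs, rhs) stores the result
+/// into lhs. A minimal sketch of such an operation (hypothetical, shown only
+/// to illustrate the expected interface):
+///
+/// \code
+///   template <typename argT, typename resT>
+///   struct AddInplaceOp
+///   {
+///       using supports_sg_loadstore = std::true_type;
+///       using supports_vec = std::false_type;
+///
+///       void operator()(resT &lhs, const argT &rhs) const { lhs += rhs; }
+///   };
+/// \endcode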
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" + +#include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common_detail.hpp" + +namespace dpctl::tensor::kernels::elementwise_common +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +struct BinaryInplaceContigFunctor +{ +private: + const argT *rhs = nullptr; + resT *lhs = nullptr; + std::size_t nelems_; + +public: + BinaryInplaceContigFunctor(const argT *rhs_tp, + resT *lhs_tp, + const std::size_t n_elems) + : rhs(rhs_tp), lhs(lhs_tp), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + BinaryInplaceOperatorT op{}; + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NB: Workgroup size must be divisible by sub-group size */ + + if constexpr (enable_sg_loadstore && + BinaryInplaceOperatorT::supports_sg_loadstore::value && + BinaryInplaceOperatorT::supports_vec::value && + (vec_sz > 1)) { + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = sg.get_max_local_range()[0]; + + std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { + +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[offset]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[offset]); + + const sycl::vec &arg_vec = + sub_group_load(sg, rhs_multi_ptr); + sycl::vec res_vec = + sub_group_load(sg, lhs_multi_ptr); + op(res_vec, arg_vec); + + sub_group_store(sg, res_vec, lhs_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + op(lhs[k], rhs[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + BinaryInplaceOperatorT::supports_sg_loadstore:: + value) { + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = sg.get_max_local_range()[0]; + + std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[offset]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[offset]); + + const sycl::vec arg_vec = + sub_group_load(sg, rhs_multi_ptr); + sycl::vec res_vec = + sub_group_load(sg, lhs_multi_ptr); +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id 
< vec_sz; ++vec_id) { + op(res_vec[vec_id], arg_vec[vec_id]); + } + sub_group_store(sg, res_vec, lhs_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + op(lhs[k], rhs[k]); + } + } + } + else { + const std::size_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::size_t elems_per_sg = elems_per_wi * sgSize; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + op(lhs[offset], rhs[offset]); + } + } + } +}; + +template +struct BinaryInplaceStridedFunctor +{ +private: + const argT *rhs = nullptr; + resT *lhs = nullptr; + TwoOffsets_IndexerT two_offsets_indexer_; + +public: + BinaryInplaceStridedFunctor(const argT *rhs_tp, + resT *lhs_tp, + const TwoOffsets_IndexerT &inp_res_indexer) + : rhs(rhs_tp), lhs(lhs_tp), two_offsets_indexer_(inp_res_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &two_offsets_ = + two_offsets_indexer_(static_cast(wid.get(0))); + + const auto &inp_offset = two_offsets_.get_first_offset(); + const auto &lhs_offset = two_offsets_.get_second_offset(); + + BinaryInplaceOperatorT op{}; + op(lhs[lhs_offset], rhs[inp_offset]); + } +}; + +template +struct BinaryInplaceRowMatrixBroadcastingFunctor +{ +private: + const argT *padded_vec; + resT *mat; + std::size_t n_elems; + std::size_t n1; + +public: + BinaryInplaceRowMatrixBroadcastingFunctor(const argT *row_tp, + resT *mat_tp, + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) + : padded_vec(row_tp), mat(mat_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + /* Workgroup size is expected to be a multiple of sub-group size */ + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + auto sg = ndit.get_sub_group(); + const std::size_t gid = ndit.get_global_linear_id(); + + std::uint8_t sgSize = sg.get_max_local_range()[0]; + std::size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + const argT vec_el = sub_group_load(sg, in_multi_ptr); + resT mat_el = sub_group_load(sg, out_multi_ptr); + + op(mat_el, vec_el); + + sub_group_store(sg, mat_el, out_multi_ptr); + } + else { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < n_elems; k += sgSize) { + op(mat[k], padded_vec[k % n1]); + } + } + } +}; + +// Typedefs for function pointers + +typedef sycl::event (*binary_inplace_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +typedef sycl::event (*binary_inplace_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_inplace_row_matrix_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + 
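+// These function-pointer types are what the per-operator dispatch tables
+// store: each kernel header defines factory templates returning the matching
+// *_impl instantiation (or nullptr when the type combination is unsupported).
+// A sketch of how such a table might be populated (the table and factory
+// names here are illustrative, not defined in this file):
+//
+//   static binary_inplace_contig_impl_fn_ptr_t
+//       add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+//   td_ns::DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+//                               AddInplaceContigFactory, td_ns::num_types>
+//       dtb;
+//   dtb.populate_dispatch_table(add_inplace_contig_dispatch_table);
+//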
+template class BinaryInplaceContigFunctorT, + template class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> +sycl::event + binary_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *rhs_p, + ssize_t rhs_offset, + char *lhs_p, + ssize_t lhs_offset, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lws = 128; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + const argTy *arg_tp = + reinterpret_cast(rhs_p) + rhs_offset; + resTy *res_tp = reinterpret_cast(lhs_p) + lhs_offset; + + if (is_aligned(arg_tp) && + is_aligned(res_tp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = kernel_name; + using Impl = + BinaryInplaceContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + else { + static constexpr bool disable_sg_loadstore = true; + using InnerKernelName = kernel_name; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = + BinaryInplaceContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + }); + return comp_ev; +} + +template class BinaryInplaceStridedFunctorT, + template class kernel_name> +sycl::event binary_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *rhs_p, + ssize_t rhs_offset, + char *lhs_p, + ssize_t lhs_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const IndexerT indexer{nd, rhs_offset, lhs_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(rhs_p); + resTy *res_tp = reinterpret_cast(lhs_p); + + using Impl = BinaryInplaceStridedFunctorT; + + cgh.parallel_for>( + {nelems}, Impl(arg_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template class BinaryInplaceRowMatrixBroadcastFunctorT, + template class kernel_name> +sycl::event binary_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + const argT *vec = reinterpret_cast(vec_p) + vec_offset; + resT *mat = reinterpret_cast(mat_p) + mat_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + std::size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + std::size_t n1_padded = n1 + max_sgSize; + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT *padded_vec = padded_vec_owner.get(); + + sycl::event make_padded_vec_ev = + dpctl::tensor::kernels::elementwise_detail::populate_padded_vector< + argT>(exec_q, vec, n1, padded_vec, n1_padded, depends); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // 
Generically, sub_group_load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sub_group_load(&padded_vec[(base / n0)]). The vector is + // padded to ensure that reads are accessible + + const std::size_t lws = 128; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + using Impl = BinaryInplaceRowMatrixBroadcastFunctorT; + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + Impl(padded_vec, mat, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels::elementwise_common diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp new file mode 100644 index 000000000000..2c965b236c87 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -0,0 +1,234 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CONJ(x) function. 
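+///
+/// For complex input types the functor returns the complex conjugate; for
+/// real, integer, and boolean inputs it is the identity, so conj over a
+/// float32 array copies values through unchanged while conj over a
+/// complex64 array flips the sign of the imaginary part.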
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::conj +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct ConjFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using rT = typename argT::value_type; + + return exprm_ns::conj(exprm_ns::complex(in)); // conj(in); + } + else { + if constexpr (!std::is_same_v) + static_assert(std::is_same_v); + return in; + } + } +}; + +template +using ConjContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ConjStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct ConjOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ConjContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class conj_contig_kernel; + +template +sycl::event conj_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using ConjHS = hyperparam_detail::ConjContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = ConjHS::vec_sz; + static constexpr std::uint8_t n_vecs = ConjHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct ConjContigFactory +{ + fnT get() + { + if constexpr (!ConjOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = conj_contig_impl; + return fn; + } + } +}; + +template +struct ConjTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::conj(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename ConjOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class conj_strided_kernel; + +template +sycl::event + conj_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, ConjOutputType, ConjStridedFunctor, conj_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ConjStridedFactory +{ + fnT get() + { + if constexpr (!ConjOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = conj_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::conj diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp new file mode 100644 index 000000000000..c2eb0f7e850e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -0,0 +1,248 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2) +/// function. 
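+///
+/// copysign(x1, x2) combines the magnitude of x1 with the sign bit of x2
+/// and is defined here for real floating-point types only; for example,
+/// copysign(3.0f, -0.0f) evaluates to -3.0f.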
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::copysign +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct CopysignFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::copysign(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = sycl::copysign(in1, in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using CopysignContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + CopysignFunctor>; + +template +struct CopysignOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct CopysignContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class copysign_contig_kernel; + +template +sycl::event copysign_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using CopySignHS = + hyperparam_detail::CopysignContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CopySignHS::vec_sz; + static constexpr std::uint8_t n_vecs = CopySignHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, CopysignOutputType, CopysignContigFunctor, + copysign_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct CopysignContigFactory +{ + fnT get() + { + if constexpr (!CopysignOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = copysign_contig_impl; + return fn; + } + } +}; + +template +struct CopysignTypeMapFactory +{ + /*! 
@brief get typeid for output type of copysign(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CopysignOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class copysign_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    copysign_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
+        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct CopysignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!CopysignOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::copysign
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
new file mode 100644
index 000000000000..7bd47d54778b
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -0,0 +1,311 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of COS(x) function.
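+///
+/// Real inputs map directly onto sycl::cos. Complex inputs with finite
+/// real and imaginary parts use the library cos; inputs with a non-finite
+/// component are routed through the identity cos(z) = cosh(i*z) so that
+/// the C99-style special-value cases can be handled uniformly.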
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::cos +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct CosFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + realT const &in_re = std::real(in); + realT const &in_im = std::imag(in); + + const bool in_re_finite = std::isfinite(in_re); + const bool in_im_finite = std::isfinite(in_im); + + /* + * Handle the nearly-non-exceptional cases where + * real and imaginary parts of input are finite. + */ + if (in_re_finite && in_im_finite) { + return exprm_ns::cos(exprm_ns::complex(in)); // cos(in); + } + + /* + * since cos(in) = cosh(I * in), for special cases, + * we return cosh(I * in). + */ + const realT x = -in_im; + const realT y = in_re; + + const bool xfinite = in_im_finite; + const bool yfinite = in_re_finite; + /* + * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. + * + * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == realT(0) && !yfinite) { + const realT y_m_y = (y - y); + const realT res_im = sycl::copysign(realT(0), x * y_m_y); + return resT{y_m_y, res_im}; + } + + /* + * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0. + * + * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0. + * The sign of 0 in the result is unspecified. + */ + if (y == realT(0) && !xfinite) { + const realT res_im = sycl::copysign(realT(0), x) * y; + return resT{x * x, res_im}; + } + + /* + * cosh(x +- I Inf) = dNaN + I dNaN. + * + * cosh(x + I NaN) = d(NaN) + I d(NaN). + */ + if (xfinite && !yfinite) { + const realT y_m_y = (y - y); + return resT{y_m_y, x * y_m_y}; + } + + /* + * cosh(+-Inf + I NaN) = +Inf + I d(NaN). + * + * cosh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * + * cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y) + */ + if (std::isinf(x)) { + if (!yfinite) { + return resT{x * x, sycl::copysign(q_nan, x)}; + } + return resT{(x * x) * sycl::cos(y), x * sycl::sin(y)}; + } + + /* + * cosh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * cosh(NaN +- I Inf) = d(NaN) + I d(NaN). + * + * cosh(NaN + I y) = d(NaN) + I d(NaN). 
+ */ + return resT{(x * x) * q_nan, (x + x) * q_nan}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::cos(in); + } + } +}; + +template +using CosContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CosStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct CosOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct CosContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class cos_contig_kernel; + +template +sycl::event cos_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using CosHS = hyperparam_detail::CosContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CosHS::vec_sz; + static constexpr std::uint8_t n_vecs = CosHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct CosContigFactory +{ + fnT get() + { + if constexpr (!CosOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cos_contig_impl; + return fn; + } + } +}; + +template +struct CosTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::cos(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CosOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class cos_strided_kernel; + +template +sycl::event cos_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CosOutputType, CosStridedFunctor, cos_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct CosStridedFactory +{ + fnT get() + { + if constexpr (!CosOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cos_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::cos diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp new file mode 100644 index 000000000000..505eb5fffc29 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -0,0 +1,301 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of COSH(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::cosh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct CoshFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + + const bool xfinite = std::isfinite(x); + const bool yfinite = std::isfinite(y); + + /* + * Handle the nearly-non-exceptional cases where + * real and imaginary parts of input are finite. + */ + if (xfinite && yfinite) { + return exprm_ns::cosh( + exprm_ns::complex(in)); // cosh(in); + } + + /* + * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. 
+ * + * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == realT(0) && !yfinite) { + const realT res_im = sycl::copysign(realT(0), x * q_nan); + return resT{q_nan, res_im}; + } + + /* + * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0. + * + * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0. + * The sign of 0 in the result is unspecified. + */ + if (y == realT(0) && !xfinite) { + const realT res_im = sycl::copysign(realT(0), x) * y; + return resT{x * x, res_im}; + } + + /* + * cosh(x +- I Inf) = dNaN + I dNaN. + * + * cosh(x + I NaN) = d(NaN) + I d(NaN). + */ + if (xfinite && !yfinite) { + return resT{q_nan, x * q_nan}; + } + + /* + * cosh(+-Inf + I NaN) = +Inf + I d(NaN). + * + * cosh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * + * cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y) + */ + if (std::isinf(x)) { + if (!yfinite) { + return resT{x * x, x * q_nan}; + } + return resT{(x * x) * sycl::cos(y), x * sycl::sin(y)}; + } + + /* + * cosh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * cosh(NaN +- I Inf) = d(NaN) + I d(NaN). + * + * cosh(NaN + I y) = d(NaN) + I d(NaN). + */ + return resT{(x * x) * (y - y), (x + x) * (y - y)}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::cosh(in); + } + } +}; + +template +using CoshContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CoshStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct CoshOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct CoshContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class cosh_contig_kernel; + +template +sycl::event cosh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using CoshHS = hyperparam_detail::CoshContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CoshHS::vec_sz; + static constexpr std::uint8_t n_vecs = CoshHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct CoshContigFactory +{ + fnT get() + { + if constexpr (!CoshOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cosh_contig_impl; + return fn; + } + } +}; + +template +struct CoshTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::cosh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CoshOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class cosh_strided_kernel; + +template +sycl::event + cosh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CoshOutputType, CoshStridedFunctor, cosh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct CoshStridedFactory +{ + fnT get() + { + if constexpr (!CoshOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cosh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::cosh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp new file mode 100644 index 000000000000..07b3566c5cef --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -0,0 +1,316 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of equality of +/// tensor elements. 
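+///
+/// The comparison returns bool for every supported type pair. Mixed
+/// signed/unsigned integer operands are compared by value: for example,
+/// equal(int32_t(-1), uint32_t(4294967295)) yields false here, whereas a
+/// naive == after the usual arithmetic conversions would yield true.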
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::equal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct EqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + using realT2 = typename argT2::value_type; + + return exprm_ns::complex(in1) == + exprm_ns::complex(in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? false : (static_cast(in1) == in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? false + : (in1 == static_cast(in2)); + } + } + } + else { + return (in1 == in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = (in1 == in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using EqualContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using EqualStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct EqualOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct EqualContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class equal_contig_kernel; + +template +sycl::event equal_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t 
res_offset, + const std::vector &depends = {}) +{ + using EqualHS = + hyperparam_detail::EqualContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = EqualHS::vec_sz; + static constexpr std::uint8_t n_vecs = EqualHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, EqualOutputType, EqualContigFunctor, + equal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct EqualContigFactory +{ + fnT get() + { + if constexpr (!EqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = equal_contig_impl; + return fn; + } + } +}; + +template +struct EqualTypeMapFactory +{ + /*! @brief get typeid for output type of operator()==(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename EqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class equal_strided_kernel; + +template +sycl::event + equal_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, EqualOutputType, EqualStridedFunctor, + equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct EqualStridedFactory +{ + fnT get() + { + if constexpr (!EqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = equal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::equal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp new file mode 100644 index 000000000000..97789e53bb5a --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -0,0 +1,269 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of EXP(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::exp +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct ExpFunctor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + if (std::isfinite(x)) { + if (std::isfinite(y)) { + return exprm_ns::exp( + exprm_ns::complex(in)); // exp(in); + } + else { + return resT{q_nan, q_nan}; + } + } + else if (std::isnan(x)) { + /* x is nan */ + if (y == realT(0)) { + return resT{in}; + } + else { + return resT{x, q_nan}; + } + } + else { + if (!sycl::signbit(x)) { /* x is +inf */ + if (y == realT(0)) { + return resT{x, y}; + } + else if (std::isfinite(y)) { + return resT{x * sycl::cos(y), x * sycl::sin(y)}; + } + else { + /* x = +inf, y = +-inf || nan */ + return resT{x, q_nan}; + } + } + else { /* x is -inf */ + if (std::isfinite(y)) { + realT exp_x = sycl::exp(x); + return resT{exp_x * sycl::cos(y), exp_x * sycl::sin(y)}; + } + else { + /* x = -inf, y = +-inf || nan */ + return resT{0, 0}; + } + } + } + } + else { + return sycl::exp(in); + } + } +}; + +template +using ExpContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ExpStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct ExpOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = 
dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ExpContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class exp_contig_kernel; + +template +sycl::event exp_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using ExpHS = hyperparam_detail::ExpContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = ExpHS::vec_sz; + static constexpr std::uint8_t n_vecs = ExpHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct ExpContigFactory +{ + fnT get() + { + if constexpr (!ExpOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp_contig_impl; + return fn; + } + } +}; + +template +struct ExpTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::exp(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename ExpOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class exp_strided_kernel; + +template +sycl::event exp_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, ExpOutputType, ExpStridedFunctor, exp_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ExpStridedFactory +{ + fnT get() + { + if constexpr (!ExpOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::exp diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp new file mode 100644 index 000000000000..dd09f4eee342 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -0,0 +1,272 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
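The complex branch of `ExpFunctor` above hand-rolls a C99-style special-value ladder instead of relying on a single library call. A compilable host-side sketch of the same branch structure, using `std::` math in place of the `sycl::`/`exprm_ns::` calls (assumed equivalent here only for illustration):

```cpp
#include <cassert>
#include <cmath>
#include <complex>
#include <limits>

// Mirrors ExpFunctor's complex path, but with std:: math so it runs anywhere.
static std::complex<double> ref_exp(const std::complex<double> &in)
{
    constexpr double q_nan = std::numeric_limits<double>::quiet_NaN();
    const double x = in.real();
    const double y = in.imag();
    if (std::isfinite(x)) {
        // a non-finite imaginary part gives no meaningful angle
        return std::isfinite(y) ? std::exp(in)
                                : std::complex<double>{q_nan, q_nan};
    }
    if (std::isnan(x)) {
        // exp(nan + 0i) preserves the signed zero; otherwise imag is nan too
        return (y == 0.0) ? in : std::complex<double>{x, q_nan};
    }
    if (!std::signbit(x)) { // x == +inf
        if (y == 0.0) {
            return {x, y};
        }
        return std::isfinite(y)
                   ? std::complex<double>{x * std::cos(y), x * std::sin(y)}
                   : std::complex<double>{x, q_nan};
    }
    // x == -inf: the magnitude collapses to zero for any finite y
    if (std::isfinite(y)) {
        const double e = std::exp(x); // == 0.0
        return {e * std::cos(y), e * std::sin(y)};
    }
    return {0.0, 0.0};
}

int main()
{
    constexpr double inf = std::numeric_limits<double>::infinity();
    assert((ref_exp({-inf, inf}) == std::complex<double>{0.0, 0.0}));
    assert(std::isnan(ref_exp({1.0, inf}).real())); // finite x, infinite y
    assert((ref_exp({inf, 0.0}) == std::complex<double>{inf, 0.0}));
}
```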
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of EXP2(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::exp2 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct Exp2Functor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + const argT tmp = in * sycl::log(realT(2)); + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(tmp); + const realT y = std::imag(tmp); + if (std::isfinite(x)) { + if (std::isfinite(y)) { + return exprm_ns::exp(exprm_ns::complex(tmp)); + } + else { + return resT{q_nan, q_nan}; + } + } + else if (std::isnan(x)) { + /* x is nan */ + if (y == realT(0)) { + return resT{in}; + } + else { + return resT{x, q_nan}; + } + } + else { + if (!sycl::signbit(x)) { /* x is +inf */ + if (y == realT(0)) { + return resT{x, y}; + } + else if (std::isfinite(y)) { + return resT{x * sycl::cos(y), x * sycl::sin(y)}; + } + else { + /* x = +inf, y = +-inf || nan */ + return resT{x, q_nan}; + } + } + else { /* x is -inf */ + if (std::isfinite(y)) { + realT exp_x = sycl::exp(x); + return resT{exp_x * sycl::cos(y), exp_x * sycl::sin(y)}; + } + else { + /* x = -inf, y = +-inf || nan */ + return resT{0, 0}; + } + } + } + } + else { + return sycl::exp2(in); + } + } +}; + +template +using Exp2ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Exp2StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Exp2OutputType +{ + using value_type = typename std::disjunction< + 
td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Exp2ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class exp2_contig_kernel; + +template +sycl::event exp2_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Exp2HS = hyperparam_detail::Exp2ContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = Exp2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Exp2HS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct Exp2ContigFactory +{ + fnT get() + { + if constexpr (!Exp2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_contig_impl; + return fn; + } + } +}; + +template +struct Exp2TypeMapFactory +{ + /*! @brief get typeid for output type of sycl::exp2(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Exp2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class exp2_strided_kernel; + +template +sycl::event + exp2_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Exp2StridedFactory +{ + fnT get() + { + if constexpr (!Exp2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::exp2 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp new file mode 100644 index 000000000000..c29030a6dc95 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -0,0 +1,282 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
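`Exp2Functor` above reduces complex `exp2` to `exp(z * ln 2)` and then runs the same special-value ladder as `ExpFunctor` on the scaled argument. A small standalone check of the underlying identity (not part of the diff):

```cpp
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    const std::complex<double> z{0.5, -1.25};
    const double ln2 = std::log(2.0);

    // what the functor evaluates internally
    const std::complex<double> via_exp = std::exp(z * ln2);

    // what exp2 means: magnitude 2^x, angle y * ln(2)
    const double mag = std::exp2(z.real());
    const std::complex<double> direct{mag * std::cos(z.imag() * ln2),
                                      mag * std::sin(z.imag() * ln2)};

    assert(std::abs(via_exp - direct) < 1e-12);
}
```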
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of EXPM1(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::expm1 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct Expm1Functor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + // expm1(x + I*y) = expm1(x)*cos(y) - 2*sin(y / 2)^2 + + // I*exp(x)*sin(y) + const realT x = std::real(in); + const realT y = std::imag(in); + + // special cases + if (std::isinf(x)) { + if (x > realT(0)) { + // positive infinity cases + if (!std::isfinite(y)) { + return resT{x, std::numeric_limits::quiet_NaN()}; + } + else if (y == realT(0)) { + return in; + } + else { + return (resT{sycl::copysign(x, sycl::cos(y)), + sycl::copysign(x, sycl::sin(y))}); + } + } + else { + // negative infinity cases + if (!std::isfinite(y)) { + // copy sign of y to guarantee + // conj(expm1(x)) == expm1(conj(x)) + return resT{realT(-1), sycl::copysign(realT(0), y)}; + } + else { + return resT{realT(-1), + sycl::copysign(realT(0), sycl::sin(y))}; + } + } + } + + if (std::isnan(x)) { + if (y == realT(0)) { + return in; + } + else { + return resT{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}; + } + } + + // x, y finite numbers + const realT cosY_val = sycl::cos(y); + const realT sinY_val = (y == 0) ? y : sycl::sin(y); + const realT sinhalfY_val = (y == 0) ? 
y : sycl::sin(y / 2); + + const realT res_re = + sycl::expm1(x) * cosY_val - 2 * sinhalfY_val * sinhalfY_val; + realT res_im = sycl::exp(x) * sinY_val; + return resT{res_re, res_im}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + static_assert(std::is_same_v); + if (in == 0) { + return in; + } + return sycl::expm1(in); + } + } +}; + +template +using Expm1ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Expm1StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Expm1OutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Expm1ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class expm1_contig_kernel; + +template +sycl::event expm1_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Expm1HS = hyperparam_detail::Expm1ContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Expm1HS::vec_sz; + static constexpr std::uint8_t n_vecs = Expm1HS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct Expm1ContigFactory +{ + fnT get() + { + if constexpr (!Expm1OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = expm1_contig_impl; + return fn; + } + } +}; + +template +struct Expm1TypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::expm1(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Expm1OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class expm1_strided_kernel; + +template +sycl::event + expm1_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Expm1OutputType, Expm1StridedFunctor, expm1_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Expm1StridedFactory +{ + fnT get() + { + if constexpr (!Expm1OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = expm1_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::expm1 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp new file mode 100644 index 000000000000..375659b94a12 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -0,0 +1,229 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of FLOOR(x) function. 
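Back in `Expm1Functor`, the real part is evaluated as `expm1(x)*cos(y) - 2*sin(y/2)^2` rather than `exp(x)*cos(y) - 1`; the rearrangement follows from `cos(y) - 1 = -2*sin(y/2)^2` and avoids catastrophic cancellation near the origin. A standalone numeric sanity check (assumed test harness, not part of the diff):

```cpp
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    const double x = 1e-9;
    const double y = 1e-9;

    // rearranged form used by the kernel
    const double re =
        std::expm1(x) * std::cos(y) - 2.0 * std::sin(y / 2) * std::sin(y / 2);
    const double im = std::exp(x) * std::sin(y);

    // to first order, expm1(x + iy) ~= x + iy for tiny arguments
    assert(std::abs(re - x) < 1e-17);
    assert(std::abs(im - y) < 1e-17);

    // the naive form exp(z) - 1 cancels away most significant digits of the
    // real part (its error is on the order of the epsilon of 1.0, ~1e-16)
    const std::complex<double> naive =
        std::exp(std::complex<double>{x, y}) - 1.0;
    (void)naive;
}
```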
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::floor +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct FloorFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (std::is_integral_v) { + return in; + } + else { + if (in == 0) { + return in; + } + return sycl::floor(in); + } + } +}; + +template +using FloorContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using FloorStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct FloorOutputType +{ + using value_type = + typename std::disjunction, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct FloorContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class floor_contig_kernel; + +template +sycl::event floor_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using FloorHS = hyperparam_detail::FloorContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = FloorHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct FloorContigFactory +{ + fnT get() + { + if constexpr (!FloorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_contig_impl; + return fn; + } + } +}; + +template +struct FloorTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::floor(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename FloorOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class floor_strided_kernel; + +template +sycl::event + floor_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, FloorOutputType, FloorStridedFunctor, floor_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct FloorStridedFactory +{ + fnT get() + { + if constexpr (!FloorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::floor diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp new file mode 100644 index 000000000000..e669a97c04ea --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -0,0 +1,546 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of FLOOR_DIVIDE(x1, x2) +/// function. 
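The `FloorOutputType` table above (and its siblings in every other header here) relies on `std::disjunction` picking the first matching `TypeMapResultEntry`. A simplified, self-contained rendering of that trick; `MapEntry` and `DefaultEntry` are illustrative stand-ins for the `td_ns` helpers:

```cpp
#include <cstdint>
#include <type_traits>

// Each entry "is" std::true_type exactly when T matches ArgT, and carries the
// mapped result type; std::disjunction inherits from the first true entry.
template <typename T, typename ArgT, typename ResT>
struct MapEntry : std::is_same<T, ArgT>
{
    using result_type = ResT;
};

// Always-true fallback; `void` plays the role of the "unsupported" marker
// that the is_defined flags above test for.
template <typename T>
struct DefaultEntry : std::true_type
{
    using result_type = T;
};

template <typename T>
struct MyFloorOutputType
{
    using value_type = typename std::disjunction<
        MapEntry<T, std::int32_t, std::int32_t>, // integers pass through
        MapEntry<T, float, float>,
        MapEntry<T, double, double>,
        DefaultEntry<void>>::result_type;
};

static_assert(std::is_same_v<MyFloorOutputType<float>::value_type, float>);
static_assert(std::is_same_v<MyFloorOutputType<char>::value_type, void>);

int main() {}
```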
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::floor_divide +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct FloorDivideFunctor +{ + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in2 == argT2(0)) { + return resT(0); + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto div = in1 / in2; + auto mod = in1 % in2; + auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0)); + return (div - corr); + } + else { + return (in1 / in2); + } + } + else { + auto div = in1 / in2; + return (div == resT(0)) ? div : resT(sycl::floor(div)); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] == argT2(0)) { + res[i] = resT(0); + } + else { + res[i] = in1[i] / in2[i]; + if constexpr (std::is_signed_v) { + auto mod = in1[i] % in2[i]; + auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0)); + res[i] -= corr; + } + } + } + return res; + } + else { + auto tmp = in1 / in2; + using tmpT = typename decltype(tmp)::element_type; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] != argT2(0)) { + tmp[i] = sycl::floor(tmp[i]); + } + } + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + return vec_cast(tmp); + } + } + } + +private: + bool l_xor(bool b1, bool b2) const { return (b1 != b2); } +}; + +template +using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + FloorDivideFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using FloorDivideStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + FloorDivideFunctor>; + +template +struct FloorDivideOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct FloorDivideContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class floor_divide_contig_kernel; + +template +sycl::event + 
floor_divide_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using FloorDivideHS = + hyperparam_detail::FloorDivideContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor, + floor_divide_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct FloorDivideContigFactory +{ + fnT get() + { + if constexpr (!FloorDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_contig_impl; + return fn; + } + } +}; + +template +struct FloorDivideTypeMapFactory +{ + /*! @brief get typeid for output type of floor_divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename FloorDivideOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class floor_divide_strided_kernel; + +template +sycl::event floor_divide_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, FloorDivideOutputType, FloorDivideStridedFunctor, + floor_divide_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct FloorDivideStridedFactory +{ + fnT get() + { + if constexpr (!FloorDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_strided_impl; + return fn; + } + } +}; + +template +struct FloorDivideInplaceFunctor +{ + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + void operator()(resT &in1, const argT &in2) const + { + if constexpr (std::is_integral_v) { + if (in2 == argT(0)) { + in1 = 0; + return; + } + if constexpr (std::is_signed_v) { + auto tmp = in1; + in1 /= in2; + auto mod = tmp % in2; + auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0)); + in1 -= corr; + } + else { + in1 /= in2; + } + } + else { + in1 /= in2; + if (in1 == resT(0)) { + return; + } + in1 = sycl::floor(in1); + } + } + + template + void operator()(sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] == argT(0)) { + in1[i] = 0; + } + else { + if constexpr (std::is_signed_v) { + auto tmp = in1[i]; + in1[i] /= in2[i]; + auto mod = tmp % in2[i]; + auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0)); + in1[i] -= corr; + } + else { + in1[i] /= in2[i]; + } + } + } + } + else { + in1 /= in2; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] != argT(0)) { + in1[i] = sycl::floor(in1[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const { return (b1 != b2); } +}; + +template +using FloorDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + FloorDivideInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template 
+using FloorDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + FloorDivideInplaceFunctor>; + +template +class floor_divide_inplace_contig_kernel; + +/* @brief Types supported by in-place floor division */ +template +struct FloorDivideInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct FloorDivideInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x //= y */ + std::enable_if_t::value, int> get() + { + if constexpr (FloorDivideInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event floor_divide_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using FloorDivideHS = + hyperparam_detail::FloorDivideContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, FloorDivideInplaceContigFunctor, + floor_divide_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct FloorDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (!FloorDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class floor_divide_inplace_strided_kernel; + +template +sycl::event floor_divide_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, FloorDivideInplaceStridedFunctor, + floor_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct FloorDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (!FloorDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::floor_divide diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp new file mode 100644 index 000000000000..9b3659faa161 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -0,0 +1,317 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
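`FloorDivideFunctor` and `FloorDivideInplaceFunctor` above both build floor division out of C++'s truncating `/` plus a sign-based correction, and define integer division by zero to yield 0. A scalar reference of those semantics (standalone sketch, not from the diff):

```cpp
#include <cassert>

// C++ '/' truncates toward zero; subtract 1 when the remainder is nonzero and
// the operands disagree in sign. Division by zero returns 0 by convention.
static long long floor_div(long long a, long long b)
{
    if (b == 0) {
        return 0;
    }
    const long long div = a / b;
    const long long mod = a % b;
    const bool corr = (mod != 0) && ((mod < 0) != (b < 0));
    return div - (corr ? 1 : 0);
}

int main()
{
    assert(floor_div(7, 2) == 3);
    assert(floor_div(-7, 2) == -4); // truncation alone would give -3
    assert(floor_div(7, -2) == -4);
    assert(floor_div(-7, -2) == 3);
    assert(floor_div(5, 0) == 0);
}
```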
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of comparison of +/// tensor elements. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::greater +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct GreaterFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::greater_complex; + return greater_complex(in1, in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? false : (static_cast(in1) > in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? 
true + : (in1 > static_cast(in2)); + } + } + } + else { + return (in1 > in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 > in2); + + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using GreaterContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using GreaterStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + GreaterFunctor>; + +template +struct GreaterOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct GreaterContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class greater_contig_kernel; + +template +sycl::event greater_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using GreaterHS = + hyperparam_detail::GreaterContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = GreaterHS::vec_sz; + static constexpr std::uint8_t n_vecs = GreaterHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, GreaterOutputType, GreaterContigFunctor, + greater_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct GreaterContigFactory +{ + fnT get() + { + if constexpr (!GreaterOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = greater_contig_impl; + return fn; + } + } +}; + +template +struct GreaterTypeMapFactory +{ + /*! 
@brief get typeid for output type of operator()>(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename GreaterOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class greater_strided_kernel; + +template +sycl::event + greater_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, GreaterOutputType, GreaterStridedFunctor, + greater_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct GreaterStridedFactory +{ + fnT get() + { + if constexpr (!GreaterOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = greater_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::greater diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp new file mode 100644 index 000000000000..25c56d4d40a4 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -0,0 +1,317 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of comparison of +/// tensor elements. 
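`GreaterFunctor` above special-cases operands of mixed signedness because the usual arithmetic conversions would wrap the negative side before comparing. A standalone sketch of that guard (names illustrative):

```cpp
#include <cassert>
#include <cstdint>

// A plain `s > u` would convert s to unsigned first, so -1 > 1u is true.
static bool safe_greater(std::int32_t s, std::uint32_t u)
{
    // a negative signed value can never exceed an unsigned one
    return (s < 0) ? false : (static_cast<std::uint32_t>(s) > u);
}

static bool safe_greater(std::uint32_t u, std::int32_t s)
{
    // any unsigned value exceeds a negative signed one
    return (s < 0) ? true : (u > static_cast<std::uint32_t>(s));
}

int main()
{
    assert(!safe_greater(std::int32_t{-1}, std::uint32_t{1}));
    assert(safe_greater(std::uint32_t{1}, std::int32_t{-1}));
    assert(!safe_greater(std::int32_t{3}, std::uint32_t{5}));
}
```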
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::greater_equal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct GreaterEqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::greater_equal_complex; + return greater_equal_complex(in1, in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? false : (static_cast(in1) >= in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? true + : (in1 >= static_cast(in2)); + } + } + } + else { + return (in1 >= in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 >= in2); + + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + GreaterEqualFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using GreaterEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + GreaterEqualFunctor>; + +template +struct GreaterEqualOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct GreaterEqualContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class greater_equal_contig_kernel; + +template +sycl::event + greater_equal_contig_impl(sycl::queue 
&exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using GreaterEqHS = + hyperparam_detail::GreaterEqualContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor, + greater_equal_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct GreaterEqualContigFactory +{ + fnT get() + { + if constexpr (!GreaterEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = greater_equal_contig_impl; + return fn; + } + } +}; + +template +struct GreaterEqualTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename GreaterEqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class greater_equal_strided_kernel; + +template +sycl::event greater_equal_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, GreaterEqualOutputType, GreaterEqualStridedFunctor, + greater_equal_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct GreaterEqualStridedFactory +{ + fnT get() + { + if constexpr (!GreaterEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = greater_equal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::greater_equal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp new file mode 100644 index 000000000000..438a5eea3ae8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -0,0 +1,249 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
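For complex operands, `GreaterEqualFunctor` defers to `math_utils::greater_equal_complex`; assuming the usual dpctl convention, that is a lexicographic order on (real, imag). A reference version under that assumption (not from the diff):

```cpp
#include <cassert>
#include <complex>

template <typename T>
static bool greater_equal_complex_ref(const std::complex<T> &a,
                                      const std::complex<T> &b)
{
    // order by real part; imaginary part breaks ties
    return (a.real() == b.real()) ? (a.imag() >= b.imag())
                                  : (a.real() > b.real());
}

int main()
{
    using C = std::complex<float>;
    assert(greater_equal_complex_ref(C{2.f, 0.f}, C{1.f, 9.f}));
    assert(greater_equal_complex_ref(C{1.f, 2.f}, C{1.f, 2.f}));
    assert(!greater_equal_complex_ref(C{1.f, 1.f}, C{1.f, 2.f}));
}
```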
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of HYPOT(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::hypot +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct HypotFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::hypot(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto res = sycl::hypot(in1, in2); + if constexpr (std::is_same_v) { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + res); + } + } +}; + +template +using HypotContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using HypotStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct HypotOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct HypotContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class hypot_contig_kernel; + +template +sycl::event hypot_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using HypotHS = + hyperparam_detail::HypotContigHyperparameterSet; + static constexpr std::uint8_t vec_sz 
= HypotHS::vec_sz; + static constexpr std::uint8_t n_vecs = HypotHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, HypotOutputType, HypotContigFunctor, + hypot_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct HypotContigFactory +{ + fnT get() + { + if constexpr (!HypotOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = hypot_contig_impl; + return fn; + } + } +}; + +template +struct HypotTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::hypot(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename HypotOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class hypot_strided_kernel; + +template +sycl::event + hypot_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, HypotOutputType, HypotStridedFunctor, + hypot_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct HypotStridedFactory +{ + fnT get() + { + if constexpr (!HypotOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = hypot_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::hypot diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp new file mode 100644 index 000000000000..667fb47efdc8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -0,0 +1,232 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
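`HypotFunctor` above forwards to `sycl::hypot` rather than computing `sqrt(x*x + y*y)` directly, which matters because the naive form overflows for large inputs. An illustration with the `std::` equivalent:

```cpp
#include <cassert>
#include <cmath>

int main()
{
    const double x = 1e200;
    const double y = 1e200;

    const double naive = std::sqrt(x * x + y * y); // x*x overflows to inf
    const double safe = std::hypot(x, y);

    assert(std::isinf(naive));
    assert(std::abs(safe - x * std::sqrt(2.0)) < 1e185); // ~sqrt(2) * 1e200
}
```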
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of IMAG(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::imag +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::is_complex_v; + +template +struct ImagFunctor +{ + + // is function constant for given argT + using is_constant = + typename std::is_same, std::false_type>; + // constant value, if constant + static constexpr resT constant_value = resT{0}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex_v) { + return std::imag(in); + } + else { + static_assert(std::is_same_v); + return constant_value; + } + } +}; + +template +using ImagContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ImagStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct ImagOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ImagContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class imag_contig_kernel; + +template +sycl::event imag_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using ImagHS = 
hyperparam_detail::ImagContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = ImagHS::vec_sz; + static constexpr std::uint8_t n_vecs = ImagHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct ImagContigFactory +{ + fnT get() + { + if constexpr (!ImagOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = imag_contig_impl; + return fn; + } + } +}; + +template +struct ImagTypeMapFactory +{ + /*! @brief get typeid for output type of std::imag(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename ImagOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class imag_strided_kernel; + +template +sycl::event + imag_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, ImagOutputType, ImagStridedFunctor, imag_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ImagStridedFactory +{ + fnT get() + { + if constexpr (!ImagOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = imag_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::imag diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp new file mode 100644 index 000000000000..8eb435c089d8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -0,0 +1,228 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
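ImagFunctor above sets its is_constant trait for non-complex inputs, because imag(x) is identically zero for real types; the kernel machinery can then fill the output with constant_value without reading any elements, and only std::complex inputs take the std::imag path. A plain-C++ analogue of that compile-time branch (illustrative sketch, not the patch's code):

// imag(x) == 0 for all real x, so a kernel may skip loads entirely.
#include <complex>
#include <iostream>
#include <type_traits>

template <typename T> struct is_complex : std::false_type {};
template <typename T> struct is_complex<std::complex<T>> : std::true_type {};

template <typename argT, typename resT>
struct ImagLike
{
    using is_constant = std::negation<is_complex<argT>>;
    static constexpr resT constant_value = resT{0};

    resT operator()(const argT &in) const
    {
        if constexpr (is_complex<argT>::value)
            return std::imag(in);
        else
            return constant_value; // never depends on `in`
    }
};

int main()
{
    ImagLike<std::complex<double>, double> ic;
    ImagLike<int, int> ii;
    std::cout << ic({1.0, -2.5}) << " " << ii(42) << "\n"; // -2.5 0
    static_assert(ImagLike<int, int>::is_constant::value);
}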
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISFINITE(x) +/// function that tests whether a tensor element is finite. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::isfinite +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct IsFiniteFunctor +{ + static_assert(std::is_same_v); + + /* + std::is_same::value || + std::is_integral::value + */ + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = true; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isfinite = std::isfinite(std::real(in)); + const bool imag_isfinite = std::isfinite(std::imag(in)); + return (real_isfinite && imag_isfinite); + } + else if constexpr (std::is_same::value || + std::is_integral::value) { + return constant_value; + } + else if constexpr (std::is_same_v) { + return sycl::isfinite(in); + } + else { + return std::isfinite(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::isfinite(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsFiniteContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using IsFiniteStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct IsFiniteOutputType +{ + using value_type = bool; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct IsFiniteContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class isfinite_contig_kernel; + +template +sycl::event isfinite_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using IsFiniteHS = + 
hyperparam_detail::IsFiniteContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, IsFiniteOutputType, IsFiniteContigFunctor, + isfinite_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct IsFiniteContigFactory +{ + fnT get() + { + fnT fn = isfinite_contig_impl; + return fn; + } +}; + +template +struct IsFiniteTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::isfinite(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsFiniteOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class isfinite_strided_kernel; + +template +sycl::event + isfinite_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct IsFiniteStridedFactory +{ + fnT get() + { + fnT fn = isfinite_strided_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::isfinite diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp new file mode 100644 index 000000000000..b7d85e21a1f2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -0,0 +1,223 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
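IsFiniteFunctor above declares a complex value finite only when both components are finite, while bool and integral inputs are finite by construction (hence the is_constant trait with constant_value = true). A small standard-C++ check of the complex rule (illustrative only):

// Finiteness of a complex value = both components finite.
#include <cmath>
#include <complex>
#include <iostream>
#include <limits>

bool isfinite_cx(const std::complex<double> &z)
{
    return std::isfinite(z.real()) && std::isfinite(z.imag());
}

int main()
{
    const double inf = std::numeric_limits<double>::infinity();
    const double nan = std::numeric_limits<double>::quiet_NaN();
    std::cout << isfinite_cx({1.0, 2.0}) << "\n"; // 1
    std::cout << isfinite_cx({1.0, inf}) << "\n"; // 0: one infinite part suffices
    std::cout << isfinite_cx({nan, 0.0}) << "\n"; // 0: NaN is not finite
}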
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISINF(x) +/// function that tests whether a tensor element is an infinity. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::isinf +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct IsInfFunctor +{ + static_assert(std::is_same_v); + + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = false; + using supports_vec = + typename std::disjunction, + std::is_floating_point>; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isinf = std::isinf(std::real(in)); + const bool imag_isinf = std::isinf(std::imag(in)); + return (real_isinf || imag_isinf); + } + else if constexpr (std::is_same::value || + std::is_integral::value) { + return constant_value; + } + else if constexpr (std::is_same_v) { + return sycl::isinf(in); + } + else { + return std::isinf(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::isinf(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsInfContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using IsInfStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct IsInfOutputType +{ + using value_type = bool; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct IsInfContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class isinf_contig_kernel; + +template +sycl::event isinf_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using IsInfHS = hyperparam_detail::IsInfContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = IsInfHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsInfHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct IsInfContigFactory +{ + fnT get() + { + fnT fn = isinf_contig_impl; + return fn; + } +}; + +template +struct IsInfTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::isinf(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsInfOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class isinf_strided_kernel; + +template +sycl::event + isinf_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, IsInfOutputType, IsInfStridedFunctor, isinf_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct IsInfStridedFactory +{ + fnT get() + { + fnT fn = isinf_strided_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::isinf diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp new file mode 100644 index 000000000000..cad2d2239de0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -0,0 +1,221 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISNAN(x) +/// function that tests whether a tensor element is a NaN. 
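IsInfFunctor above and IsNanFunctor below apply the complementary rule: a complex value is infinite (or NaN) when either component is, so a value such as (inf, NaN) reports true for both predicates. Illustrative standard-C++ check:

// Complex isinf/isnan use an "either component" disjunction.
#include <cmath>
#include <complex>
#include <iostream>
#include <limits>

bool isinf_cx(const std::complex<double> &z)
{
    return std::isinf(z.real()) || std::isinf(z.imag());
}

bool isnan_cx(const std::complex<double> &z)
{
    return std::isnan(z.real()) || std::isnan(z.imag());
}

int main()
{
    const double inf = std::numeric_limits<double>::infinity();
    const double nan = std::numeric_limits<double>::quiet_NaN();
    std::complex<double> z{inf, nan};
    std::cout << isinf_cx(z) << " " << isnan_cx(z) << "\n"; // 1 1
    std::cout << isinf_cx({0.0, inf}) << "\n";              // 1
}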
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::isnan +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct IsNanFunctor +{ + static_assert(std::is_same_v); + + /* + std::is_same::value || + std::is_integral::value + */ + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = false; + using supports_vec = typename std::true_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isnan = sycl::isnan(std::real(in)); + const bool imag_isnan = sycl::isnan(std::imag(in)); + return (real_isnan || imag_isnan); + } + else if constexpr (std::is_same::value || + std::is_integral::value) { + return constant_value; + } + else { + return sycl::isnan(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::isnan(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsNanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using IsNanStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct IsNanOutputType +{ + using value_type = bool; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct IsNanContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class isnan_contig_kernel; + +template +sycl::event isnan_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using IsNanHS = hyperparam_detail::IsNanContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = IsNanHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsNanHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct IsNanContigFactory +{ + fnT get() + { + fnT fn = isnan_contig_impl; + return fn; + } +}; + +template +struct IsNanTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::isnan(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsNanOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class isnan_strided_kernel; + +template +sycl::event + isnan_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, IsNanOutputType, IsNanStridedFunctor, isnan_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct IsNanStridedFactory +{ + fnT get() + { + fnT fn = isnan_strided_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::isnan diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp new file mode 100644 index 000000000000..19077936372e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -0,0 +1,314 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of comparison of +/// tensor elements. 
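The LessFunctor that follows guards against the classic signed/unsigned pitfall: naively, -1 < 1u evaluates to false because -1 converts to a huge unsigned value, so the functor special-cases mixed signedness (a negative signed operand decides the result outright; otherwise it compares after a safe cast). C++20's std::cmp_less encodes the same rule, as this short standalone demo shows (illustrative only; requires C++20):

// The signed/unsigned hazard the functor guards against, next to the fix.
#include <iostream>
#include <utility>

int main()
{
    int s = -1;
    unsigned u = 1u;
    std::cout << (s < u) << "\n"; // 0: -1 wraps to 4294967295 before comparing
    std::cout << ((s < 0) ? true : (static_cast<unsigned>(s) < u)) << "\n"; // 1
    std::cout << std::cmp_less(s, u) << "\n"; // 1: value-correct comparison
}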
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::less +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LessFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::less_complex; + return less_complex(in1, in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? true : (static_cast(in1) < in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? false + : (in1 < static_cast(in2)); + } + } + } + else { + return (in1 < in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = (in1 < in2); + + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using LessContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LessStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct LessOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LessContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class less_contig_kernel; + +template +sycl::event less_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + 
using LessHS = + hyperparam_detail::LessContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LessHS::vec_sz; + static constexpr std::uint8_t n_vecs = LessHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template +struct LessContigFactory +{ + fnT get() + { + if constexpr (!LessOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = less_contig_impl; + return fn; + } + } +}; + +template +struct LessTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename LessOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class less_strided_kernel; + +template +sycl::event + less_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LessOutputType, LessStridedFunctor, + less_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LessStridedFactory +{ + fnT get() + { + if constexpr (!LessOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = less_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::less diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp new file mode 100644 index 000000000000..a0b23693e70d --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -0,0 +1,316 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
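Throughout these files, each *ContigFactory::get() / *StridedFactory::get() returns either a concrete implementation or nullptr when the corresponding OutputType::is_defined is false; host code assembles the results into type-indexed dispatch tables and treats nullptr entries as "unsupported type combination". A minimal self-contained analogue of that pattern (all names here are hypothetical, not the patch's API):

// Unsupported type pairs yield nullptr entries that callers must check.
#include <cstdio>

using fn_t = bool (*)(const void *, const void *);

template <typename T1, typename T2>
struct OutputDefined { static constexpr bool value = true; };
template <>
struct OutputDefined<bool, float> { static constexpr bool value = false; }; // say

template <typename T1, typename T2>
bool less_impl(const void *a, const void *b)
{
    return *static_cast<const T1 *>(a) < *static_cast<const T2 *>(b);
}

template <typename fnT, typename T1, typename T2>
struct ContigFactory
{
    fnT get()
    {
        if constexpr (!OutputDefined<T1, T2>::value)
            return nullptr;
        else
            return less_impl<T1, T2>;
    }
};

int main()
{
    fn_t f = ContigFactory<fn_t, int, int>{}.get();
    fn_t g = ContigFactory<fn_t, bool, float>{}.get();
    int a = 2, b = 3;
    std::printf("%d %s\n", f(&a, &b), g ? "supported" : "unsupported"); // 1 unsupported
}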
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of comparison of +/// tensor elements. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::less_equal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LessEqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::less_equal_complex; + return less_equal_complex(in1, in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? true : (static_cast(in1) <= in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? 
false + : (in1 <= static_cast(in2)); + } + } + } + else { + return (in1 <= in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 <= in2); + + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LessEqualFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LessEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LessEqualFunctor>; + +template +struct LessEqualOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LessEqualContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class less_equal_contig_kernel; + +template +sycl::event less_equal_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LessEqHS = + hyperparam_detail::LessEqualContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LessEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = LessEqHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor, + less_equal_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct LessEqualContigFactory +{ + fnT get() + { + if constexpr (!LessEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = less_equal_contig_impl; + return fn; + } + } +}; + +template +struct LessEqualTypeMapFactory +{ + /*! 
@brief get typeid for output type of operator()>(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename LessEqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class less_equal_strided_kernel; + +template +sycl::event + less_equal_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LessEqualOutputType, LessEqualStridedFunctor, + less_equal_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct LessEqualStridedFactory +{ + fnT get() + { + if constexpr (!LessEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = less_equal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::less_equal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log.hpp new file mode 100644 index 000000000000..05e5048f65a7 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -0,0 +1,222 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOG(x) function. 
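For complex input the LogFunctor below computes the principal logarithm via the SYCL complex extension (exprm_ns::log); std::log on std::complex has the same principal-branch semantics, log z = ln|z| + i*atan2(im z, re z), as this quick standard-C++ check shows (illustrative only):

// Principal-branch complex log: log z = ln|z| + i*atan2(im, re).
#include <cmath>
#include <complex>
#include <iostream>

int main()
{
    std::complex<double> z{-1.0, 0.0};
    std::complex<double> manual{std::log(std::abs(z)),
                                std::atan2(z.imag(), z.real())};
    std::cout << std::log(z) << " " << manual << "\n"; // both ~(0,3.14159)
}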
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::log +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct LogFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + return exprm_ns::log(exprm_ns::complex(in)); // log(in); + } + else { + return sycl::log(in); + } + } +}; + +template +using LogContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct LogOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct LogContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class log_contig_kernel; + +template +sycl::event log_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using LogHS = hyperparam_detail::LogContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct LogContigFactory +{ + fnT get() + { + if constexpr (!LogOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log_contig_impl; + return fn; + } + } +}; + +template +struct LogTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::log(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename LogOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class log_strided_kernel; + +template +sycl::event log_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, LogOutputType, LogStridedFunctor, log_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LogStridedFactory +{ + fnT get() + { + if constexpr (!LogOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::log diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp new file mode 100644 index 000000000000..8ddb701ea622 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -0,0 +1,240 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOG10(x) function. 
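The complex branch of the Log10Functor below relies on the change-of-base identity log10 z = log z / ln 10 (computed with exprm_ns::log in the kernel). The same identity with std::complex, checked against the library's own log10 (illustrative sketch):

// Change-of-base identity used for complex log10.
#include <cmath>
#include <complex>
#include <iostream>

int main()
{
    std::complex<double> z{3.0, 4.0};
    std::cout << std::log(z) / std::log(10.0) << "\n"; // equals...
    std::cout << std::log10(z) << "\n";                // ...the library log10
}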
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::log10 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct Log10Functor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + // return (log(in) / log(realT{10})); + return exprm_ns::log(exprm_ns::complex(in)) / + sycl::log(realT{10}); + } + else { + return sycl::log10(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::log10(in); + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using Log10ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Log10StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Log10OutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Log10ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class log10_contig_kernel; + +template +sycl::event log10_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Log10HS = hyperparam_detail::Log10ContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Log10HS::vec_sz; + static constexpr std::uint8_t n_vecs = Log10HS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct Log10ContigFactory +{ + fnT get() + { + if constexpr (!Log10OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log10_contig_impl; + return fn; + } + } +}; + +template +struct Log10TypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::log10(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Log10OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class log10_strided_kernel; + +template +sycl::event + log10_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Log10OutputType, Log10StridedFunctor, log10_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Log10StridedFactory +{ + fnT get() + { + if constexpr (!Log10OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log10_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::log10 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp new file mode 100644 index 000000000000..8365932aead7 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -0,0 +1,248 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOG1P(x) function. 
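log1p exists because log(1 + x) loses every significant digit when x is tiny: 1 + x rounds to exactly 1 in floating point. That is also why the complex branch of the Log1pFunctor below reformulates through sycl::log1p(x*(2 + x) + y*y) / 2 near zero instead of taking log of a hypot. A two-line standard-C++ demonstration of the real case (illustrative only):

// For tiny x, 1.0 + x rounds to 1.0 and log(1.0 + x) returns 0,
// while log1p(x) keeps the leading-order answer (~x).
#include <cmath>
#include <cstdio>

int main()
{
    double x = 1e-18;
    std::printf("%.3e  %.3e\n", std::log(1.0 + x), std::log1p(x));
    // prints: 0.000e+00  1.000e-18
}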
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::log1p +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +// TODO: evaluate precision against alternatives +template +struct Log1pFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + // log1p(z) = ln((x + 1) + yI) + // = ln(|(x + 1) + yi|) + // + I * atan2(y, x + 1) + // = ln(sqrt((x + 1)^2 + y^2)) + // + I *atan2(y, x + 1) + // = log1p(x^2 + 2x + y^2) / 2 + // + I * atan2(y, x + 1) + using realT = typename argT::value_type; + const realT x = std::real(in); + const realT y = std::imag(in); + + // imaginary part of result + const realT res_im = sycl::atan2(y, x + 1); + + if (std::max(sycl::fabs(x), sycl::fabs(y)) < realT{.1}) { + const realT v = x * (2 + x) + y * y; + return resT{sycl::log1p(v) / 2, res_im}; + } + else { + // when not close to zero, + // prevent overflow + const realT m = sycl::hypot(x + 1, y); + return resT{sycl::log(m), res_im}; + } + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::log1p(in); + } + } +}; + +template +using Log1pContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Log1pStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Log1pOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Log1pContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class log1p_contig_kernel; + +template +sycl::event log1p_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Log1pHS = hyperparam_detail::Log1pContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Log1pHS::vec_sz; + static constexpr std::uint8_t n_vecs = Log1pHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct 
Log1pContigFactory +{ + fnT get() + { + if constexpr (!Log1pOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log1p_contig_impl; + return fn; + } + } +}; + +template +struct Log1pTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::log1p(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Log1pOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class log1p_strided_kernel; + +template +sycl::event + log1p_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Log1pOutputType, Log1pStridedFunctor, log1p_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Log1pStridedFactory +{ + fnT get() + { + if constexpr (!Log1pOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log1p_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::log1p diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp new file mode 100644 index 000000000000..3cb537b82522 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -0,0 +1,241 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOG2(x) function. 
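The complex branch of Log1pFunctor above switches algorithms on magnitude: near the origin it evaluates log1p(x*(2 + x) + y*y) / 2, which is exact because (x + 1)^2 + y^2 = 1 + (x^2 + 2x + y^2), and it avoids the cancellation that the direct log(hypot(x + 1, y)) form suffers for tiny |z|. That identity can be checked on the host with the C++ standard library alone; the helper names below are mine, not from the patch.

#include <cmath>
#include <complex>
#include <cstdio>

// Real part of log1p(z) two ways, for z = x + yi.
static double naive_re(double x, double y)
{
    return std::log(std::hypot(x + 1.0, y)); // loses digits for tiny |z|
}

static double stable_re(double x, double y)
{
    return 0.5 * std::log1p(x * (2.0 + x) + y * y); // functor's small branch
}

int main()
{
    const double x = 1e-12, y = 1e-12;
    // Leading behaviour: Re log1p(z) ~ x + (y^2 - x^2)/2, i.e. ~1e-12 here;
    // the naive form is only accurate to ~4 digits at this magnitude.
    std::printf("naive : %.17e\n", naive_re(x, y));
    std::printf("stable: %.17e\n", stable_re(x, y));
    // The imaginary part is shared by both branches of the functor:
    std::printf("imag  : %.17e\n", std::atan2(y, x + 1.0));
}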
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::log2 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct Log2Functor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + // log(in) / log(realT{2}); + return exprm_ns::log(exprm_ns::complex(in)) / + sycl::log(realT{2}); + } + else { + return sycl::log2(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::log2(in); + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using Log2ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Log2StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Log2OutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Log2ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class log2_contig_kernel; + +template +sycl::event log2_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Log2HS = hyperparam_detail::Log2ContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Log2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Log2HS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct Log2ContigFactory +{ + fnT get() + { + if constexpr (!Log2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log2_contig_impl; + return fn; + } + } +}; + +template +struct Log2TypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::log2(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Log2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class log2_strided_kernel; + +template +sycl::event + log2_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Log2OutputType, Log2StridedFunctor, log2_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Log2StridedFactory +{ + fnT get() + { + if constexpr (!Log2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log2_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::log2 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp new file mode 100644 index 000000000000..3a79950672d2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -0,0 +1,263 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// \file +/// This file defines kernels for elementwise evaluation of LOGADDEXP(x1, x2) +/// function. 
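Log2Functor above needs no dedicated complex algorithm: it computes log(z) divided by ln 2 through the experimental SYCL complex wrapper. For validating outputs, the host-side equivalent over std::complex is a one-liner, sketched here with illustrative values.

#include <cmath>
#include <complex>
#include <cstdio>

// log2(z) = log(z) / ln(2); verify by raising 2 back to the result.
int main()
{
    const std::complex<double> z{-3.0, 4.0};
    const std::complex<double> w = std::log(z) / std::log(2.0);
    const std::complex<double> back = std::pow(std::complex<double>{2.0}, w);
    std::printf("w   = %g %+gi\n", w.real(), w.imag());
    std::printf("2^w = %g %+gi (expect -3 +4i)\n", back.real(), back.imag());
}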
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "common.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels::logaddexp
+{
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template
+struct LogAddExpFunctor
+{
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using dpctl::tensor::math_utils::logaddexp;
+        return logaddexp(in1, in2);
+    }
+
+    template
+    sycl::vec
+    operator()(const sycl::vec &in1,
+               const sycl::vec &in2) const
+    {
+        sycl::vec res;
+        auto diff = in1 - in2; // take advantage of faster vec arithmetic
+
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            if (std::isfinite(diff[i])) {
+                res[i] = std::max(in1[i], in2[i]) +
+                         impl_finite(-sycl::fabs(diff[i]));
+            }
+            else {
+                using dpctl::tensor::math_utils::logaddexp;
+                res[i] = logaddexp(in1[i], in2[i]);
+            }
+        }
+
+        return res;
+    }
+
+private:
+    template
+    T impl_finite(T const &in) const
+    {
+        return (in > 0) ? (in + sycl::log1p(sycl::exp(-in)))
+                        : sycl::log1p(sycl::exp(in));
+    }
+};
+
+template
+using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogAddExpFunctor,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template
+using LogAddExpStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogAddExpFunctor>;
+
+template
+struct LogAddExpOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry,
+        td_ns::BinaryTypeMapResultEntry,
+        td_ns::BinaryTypeMapResultEntry,
+        td_ns::DefaultResultEntry>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template
+struct LogAddExpContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template
+class logaddexp_contig_kernel;
+
+template
+sycl::event logaddexp_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector &depends = {})
+{
+    using LogAddExpHS =
+        hyperparam_detail::LogAddExpContigHyperparameterSet;
+    static constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor,
+        logaddexp_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template
+struct LogAddExpContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogAddExpOutputType::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logaddexp_contig_impl;
+            return fn;
+        }
+    }
+};
+
+template
+struct LogAddExpTypeMapFactory
+{
+    /*!
@brief get typeid for output type of logaddexp(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename LogAddExpOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logaddexp_strided_kernel; + +template +sycl::event + logaddexp_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogAddExpOutputType, LogAddExpStridedFunctor, + logaddexp_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LogAddExpStridedFactory +{ + fnT get() + { + if constexpr (!LogAddExpOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logaddexp_strided_impl; + return fn; + } + } +}; + +template +class logaddexp_matrix_row_broadcast_sg_krn; + +} // namespace dpctl::tensor::kernels::logaddexp diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp new file mode 100644 index 000000000000..39049dab8d5e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -0,0 +1,291 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGICAL_AND(x1, x2) +/// function. 
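The sycl::vec overload of LogAddExpFunctor above rewrites logaddexp(a, b) as max(a, b) + log1p(exp(-|a - b|)) whenever the difference is finite, the standard overflow-safe form, and falls back to the scalar helper for inf/NaN pairs. A scalar host-side comparison against the naive formula shows the motivation; naive and stable are my names, not the patch's.

#include <cmath>
#include <cstdio>

static double naive(double a, double b)
{
    return std::log(std::exp(a) + std::exp(b)); // exp overflows past ~709
}

static double stable(double a, double b)
{
    const double d = std::fabs(a - b);
    if (std::isfinite(d)) {
        return std::fmax(a, b) + std::log1p(std::exp(-d));
    }
    return naive(a, b); // inf/NaN inputs resolve correctly this way
}

int main()
{
    std::printf("naive (1000, 1000): %g\n", naive(1000.0, 1000.0));  // inf
    std::printf("stable(1000, 1000): %g\n", stable(1000.0, 1000.0)); // 1000.69...
}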
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::logical_and +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LogicalAndFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + using tu_ns::convert_impl; + + return (convert_impl(in1) && + convert_impl(in2)); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 && in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LogicalAndFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogicalAndStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LogicalAndFunctor>; + +template +struct LogicalAndOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LogicalAndContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logical_and_contig_kernel; + +template +sycl::event + logical_and_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LogicalAndHS = + hyperparam_detail::LogicalAndContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor, + logical_and_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + 
res_offset, depends); +} + +template +struct LogicalAndContigFactory +{ + fnT get() + { + if constexpr (!LogicalAndOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_and_contig_impl; + return fn; + } + } +}; + +template +struct LogicalAndTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool + */ + std::enable_if_t::value, int> get() + { + using rT = typename LogicalAndOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logical_and_strided_kernel; + +template +sycl::event + logical_and_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogicalAndOutputType, LogicalAndStridedFunctor, + logical_and_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct LogicalAndStridedFactory +{ + fnT get() + { + if constexpr (!LogicalAndOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_and_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::logical_and diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp new file mode 100644 index 000000000000..b8f1c042ca73 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -0,0 +1,199 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
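As LogicalAndFunctor above shows, LOGICAL_AND converts each operand to bool first, so the semantics are NumPy-style truthiness, with a complex value counting as true when either component is nonzero. A standalone sketch of that conversion rule follows; truthy and the sample values are mine. Note also that, unlike scalar &&, the elementwise kernel always evaluates both inputs, so there is no short-circuiting across an array.

#include <complex>
#include <cstdio>

// Host-side mirror of the bool conversion applied by the functor.
template <typename T>
bool truthy(const T &v)
{
    return static_cast<bool>(v);
}

template <typename T>
bool truthy(const std::complex<T> &v)
{
    return v.real() != T(0) || v.imag() != T(0);
}

int main()
{
    const std::complex<double> z0{0.0, 0.0}, zi{0.0, 2.5};
    std::printf("and(3, 0)      -> %d\n", truthy(3) && truthy(0));  // 0
    std::printf("and(3, 0+2.5i) -> %d\n", truthy(3) && truthy(zi)); // 1
    std::printf("and(3, 0+0i)   -> %d\n", truthy(3) && truthy(z0)); // 0
}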
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGICAL_NOT(x) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::logical_not +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LogicalNotFunctor +{ + static_assert(std::is_same_v); + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT &in) const + { + using tu_ns::convert_impl; + return !convert_impl(in); + } +}; + +template +using LogicalNotContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogicalNotStridedFunctor = + elementwise_common::UnaryStridedFunctor>; + +template +struct LogicalNotOutputType +{ + using value_type = bool; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct LogicalNotContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logical_not_contig_kernel; + +template +sycl::event + logical_not_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using LogicalNotHS = + hyperparam_detail::LogicalNotContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, LogicalNotOutputType, LogicalNotContigFunctor, + logical_not_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct LogicalNotContigFactory +{ + fnT get() + { + fnT fn = logical_not_contig_impl; + return fn; + } +}; + +template +struct LogicalNotTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::logical_not(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename LogicalNotOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logical_not_strided_kernel; + +template +sycl::event + logical_not_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LogicalNotStridedFactory +{ + fnT get() + { + fnT fn = logical_not_strided_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::logical_not diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp new file mode 100644 index 000000000000..637e7681e7c0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -0,0 +1,290 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGICAL_OR(x1, x2) +/// function. 
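LogicalNotOutputType above is unconditionally bool, which is why LogicalNotContigFactory and LogicalNotStridedFactory carry no nullptr branch: the operation is defined for every input type, in contrast to, say, logaddexp. Its negated-truthiness rule reduces to the following host-side behaviour (illustrative code, not from the patch).

#include <complex>
#include <cstdio>

template <typename T>
bool logical_not_of(const T &v)
{
    return !static_cast<bool>(v);
}

template <typename T>
bool logical_not_of(const std::complex<T> &v)
{
    // true only when both components are zero
    return v.real() == T(0) && v.imag() == T(0);
}

int main()
{
    std::printf("%d\n", logical_not_of(0));                          // 1
    std::printf("%d\n", logical_not_of(7));                          // 0
    std::printf("%d\n", logical_not_of(std::complex<double>{0, 1})); // 0
}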
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::logical_or +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LogicalOrFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + using tu_ns::convert_impl; + + return (convert_impl(in1) || + convert_impl(in2)); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 || in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LogicalOrFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogicalOrStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LogicalOrFunctor>; + +template +struct LogicalOrOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LogicalOrContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logical_or_contig_kernel; + +template +sycl::event logical_or_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LogicalOrHS = + hyperparam_detail::LogicalOrContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor, + logical_or_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, 
depends); +} + +template +struct LogicalOrContigFactory +{ + fnT get() + { + if constexpr (!LogicalOrOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_or_contig_impl; + return fn; + } + } +}; + +template +struct LogicalOrTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool + */ + std::enable_if_t::value, int> get() + { + using rT = typename LogicalOrOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logical_or_strided_kernel; + +template +sycl::event + logical_or_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogicalOrOutputType, LogicalOrStridedFunctor, + logical_or_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct LogicalOrStridedFactory +{ + fnT get() + { + if constexpr (!LogicalOrOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_or_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::logical_or diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp new file mode 100644 index 000000000000..698e4d9ab5c1 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -0,0 +1,292 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
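A detail worth flagging in the vec overloads of logical_or above and logical_xor below: applying || (or a comparison) to sycl::vec does not yield sycl::vec<bool, N>. Per the SYCL specification it produces a vector of same-sized signed integers with -1 encoding true, which is why the functors pass the intermediate through vec_cast when resT differs from the deduced element type. Assuming a SYCL 2020 toolchain, the effect is visible even in host code; the variable names below are mine.

#include <cstdio>
#include <sycl/sycl.hpp>

int main()
{
    const sycl::vec<float, 4> a{0.f, 1.f, 0.f, 2.f};
    const sycl::vec<float, 4> b{0.f, 0.f, 3.f, 4.f};

    auto t = (a || b); // element type is a signed integer, not bool
    for (int i = 0; i < 4; ++i) {
        // prints -1 (true) or 0 (false) per SYCL's vec logical-op convention
        std::printf("%d ", static_cast<int>(t[i]));
    }
    std::printf("\n"); // expected: 0 -1 -1 -1
}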
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGICAL_XOR(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::logical_xor +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LogicalXorFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + using tu_ns::convert_impl; + + return (convert_impl(in1) != + convert_impl(in2)); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + using tu_ns::vec_cast; + auto tmp1 = vec_cast(in1); + auto tmp2 = vec_cast(in2); + + auto tmp = (tmp1 != tmp2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + return vec_cast( + tmp); + } + } +}; + +template +using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LogicalXorFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogicalXorStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LogicalXorFunctor>; + +template +struct LogicalXorOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LogicalXorContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logical_xor_contig_kernel; + +template +sycl::event + logical_xor_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LogicalXorHS = + hyperparam_detail::LogicalXorContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = 
LogicalXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor, + logical_xor_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct LogicalXorContigFactory +{ + fnT get() + { + if constexpr (!LogicalXorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_xor_contig_impl; + return fn; + } + } +}; + +template +struct LogicalXorTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool + */ + std::enable_if_t::value, int> get() + { + using rT = typename LogicalXorOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logical_xor_strided_kernel; + +template +sycl::event + logical_xor_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogicalXorOutputType, LogicalXorStridedFunctor, + logical_xor_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct LogicalXorStridedFactory +{ + fnT get() + { + if constexpr (!LogicalXorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_xor_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::logical_xor diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp new file mode 100644 index 000000000000..52494cceba93 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -0,0 +1,321 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of MAXIMUM(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "common.hpp" +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::maximum +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct MaximumFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::max_complex; + return max_complex(in1, in2); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + const bool choose_first = (sycl::isnan(in1) || (in1 > in2)); + return (choose_first) ? in1 : in2; + } + else { + return (in1 > in2) ? in1 : in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + const auto &v1 = in1[i]; + const auto &v2 = in2[i]; + if constexpr (std::is_floating_point_v || + std::is_same_v) { + const bool choose_first = (sycl::isnan(v1) || (v1 > v2)); + res[i] = (choose_first) ? v1 : v2; + } + else { + res[i] = (v1 > v2) ? 
v1 : v2; + } + } + return res; + } +}; + +template +using MaximumContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MaximumStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MaximumFunctor>; + +template +struct MaximumOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct MaximumContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class maximum_contig_kernel; + +template +sycl::event maximum_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MaxHS = + hyperparam_detail::MaximumContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MaxHS::vec_sz; + static constexpr std::uint8_t n_vecs = MaxHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MaximumOutputType, MaximumContigFunctor, + maximum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct MaximumContigFactory +{ + fnT get() + { + if constexpr (!MaximumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = maximum_contig_impl; + return fn; + } + } +}; + +template +struct MaximumTypeMapFactory +{ + /*! 
@brief get typeid for output type of maximum(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MaximumOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class maximum_strided_kernel; + +template +sycl::event + maximum_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MaximumOutputType, MaximumStridedFunctor, + maximum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct MaximumStridedFactory +{ + fnT get() + { + if constexpr (!MaximumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = maximum_strided_impl; + return fn; + } + } +}; +} // namespace dpctl::tensor::kernels::maximum diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp new file mode 100644 index 000000000000..c11961f8c5c0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -0,0 +1,321 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of MINIMUM(x1, x2) +/// function. 
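Both the scalar and vec paths of MaximumFunctor above choose in1 whenever it is NaN or strictly greater, so NaNs propagate regardless of operand order, matching numpy.maximum semantics; MinimumFunctor below mirrors the rule with the < comparison. Contrast this with std::max, whose NaN result flips with argument order. The helper below is mine, restating the functor's scalar rule on the host.

#include <algorithm>
#include <cmath>
#include <cstdio>

static double nan_propagating_max(double a, double b)
{
    return (std::isnan(a) || a > b) ? a : b; // same rule as the functor
}

int main()
{
    const double nan = std::nan("");
    // std::max returns its first argument when (a < b) is false,
    // so a NaN survives only in one order:
    std::printf("std::max(nan, 1.0) = %g\n", std::max(nan, 1.0)); // nan
    std::printf("std::max(1.0, nan) = %g\n", std::max(1.0, nan)); // 1
    // The kernel's rule propagates NaN from either side:
    std::printf("maximum(nan, 1.0)  = %g\n", nan_propagating_max(nan, 1.0));
    std::printf("maximum(1.0, nan)  = %g\n", nan_propagating_max(1.0, nan));
}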
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "common.hpp" +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::minimum +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct MinimumFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::min_complex; + return min_complex(in1, in2); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + const bool choose_first = sycl::isnan(in1) || (in1 < in2); + return (choose_first) ? in1 : in2; + } + else { + return (in1 < in2) ? in1 : in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + const auto &v1 = in1[i]; + const auto &v2 = in2[i]; + if constexpr (std::is_floating_point_v || + std::is_same_v) { + const bool choose_first = sycl::isnan(v1) || (v1 < v2); + res[i] = (choose_first) ? v1 : v2; + } + else { + res[i] = (v1 < v2) ? v1 : v2; + } + } + return res; + } +}; + +template +using MinimumContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MinimumStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MinimumFunctor>; + +template +struct MinimumOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct MinimumContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class minimum_contig_kernel; + +template +sycl::event minimum_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MinHS = + 
hyperparam_detail::MinimumContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MinHS::vec_sz; + static constexpr std::uint8_t n_vecs = MinHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MinimumOutputType, MinimumContigFunctor, + minimum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct MinimumContigFactory +{ + fnT get() + { + if constexpr (!MinimumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = minimum_contig_impl; + return fn; + } + } +}; + +template +struct MinimumTypeMapFactory +{ + /*! @brief get typeid for output type of minimum(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MinimumOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class minimum_strided_kernel; + +template +sycl::event + minimum_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MinimumOutputType, MinimumStridedFunctor, + minimum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct MinimumStridedFactory +{ + fnT get() + { + if constexpr (!MinimumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = minimum_strided_impl; + return fn; + } + } +}; +} // namespace dpctl::tensor::kernels::minimum diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp new file mode 100644 index 000000000000..58ff88b3afeb --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -0,0 +1,641 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of MUL(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::kernels::multiply +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct MultiplyFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + using realT2 = typename argT2::value_type; + + return exprm_ns::complex(in1) * + exprm_ns::complex(in2); + } + else { + return in1 * in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = in1 * in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using MultiplyContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MultiplyFunctor>; + +template +struct MultiplyOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct 
MultiplyContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class multiply_contig_kernel; + +template +sycl::event multiply_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MulHS = + hyperparam_detail::MultiplyContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MulHS::vec_sz; + static constexpr std::uint8_t n_vecs = MulHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor, + multiply_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct MultiplyContigFactory +{ + fnT get() + { + if constexpr (!MultiplyOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_contig_impl; + return fn; + } + } +}; + +template +struct MultiplyTypeMapFactory +{ + /*! @brief get typeid for output type of multiply(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MultiplyOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class multiply_strided_kernel; + +template +sycl::event + multiply_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MultiplyOutputType, MultiplyStridedFunctor, + multiply_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct MultiplyStridedFactory +{ + fnT get() + { + if constexpr (!MultiplyOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_strided_impl; + return fn; + } + } +}; + +template +class multiply_matrix_row_broadcast_sg_krn; + +template +using MultiplyContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + MultiplyFunctor>; + +template +sycl::event multiply_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] * vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, MultiplyContigMatrixContigRowBroadcastingFunctor, + multiply_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, + res_p, res_offset, depends); +} + +template +struct MultiplyContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (!MultiplyOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename MultiplyOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + multiply_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event multiply_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] * vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return multiply_contig_matrix_contig_row_broadcast_impl( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +}; + +template +struct MultiplyContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!MultiplyOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename MultiplyOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + multiply_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +template +struct MultiplyInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) { res *= in; } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res *= in; + } +}; + +template +using MultiplyInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + MultiplyInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MultiplyInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + MultiplyInplaceFunctor>; + +template +class multiply_inplace_contig_kernel; + +/* @brief Types supported by in-place multiplication */ +template +struct MultiplyInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + resTy, + 
std::complex>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MultiplyInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x *= y */ + std::enable_if_t::value, int> get() + { + if constexpr (MultiplyInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + multiply_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MulHS = + hyperparam_detail::MultiplyContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MulHS::vec_sz; + static constexpr std::uint8_t n_vecs = MulHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, MultiplyInplaceContigFunctor, + multiply_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct MultiplyInplaceContigFactory +{ + fnT get() + { + if constexpr (!MultiplyInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_inplace_contig_impl; + return fn; + } + } +}; + +template +class multiply_inplace_strided_kernel; + +template +sycl::event multiply_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, MultiplyInplaceStridedFunctor, + multiply_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct MultiplyInplaceStridedFactory +{ + fnT get() + { + if constexpr (!MultiplyInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_inplace_strided_impl; + return fn; + } + } +}; + +template +class multiply_inplace_row_matrix_broadcast_sg_krn; + +template +using MultiplyInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + MultiplyInplaceFunctor>; + +template +sycl::event multiply_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, MultiplyInplaceRowMatrixBroadcastingFunctor, + multiply_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct MultiplyInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!MultiplyInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace dpctl::tensor::kernels::multiply diff --git 
a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp new file mode 100644 index 000000000000..e0ac856a3818 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -0,0 +1,219 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of NEGATIVE(x) +/// function that returns -x. 
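+///
+/// The scalar rule is plain unary minus on the input value; a minimal
+/// host-side sketch of what the functor computes per element (illustrative
+/// only, not exercised by the kernel plumbing):
+///
+///     float x = 2.5f;
+///     float y = -x; // NEGATIVE(2.5f) == -2.5f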
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::negative +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct NegativeFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) const { return -x; } +}; + +template +using NegativeContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +struct NegativeOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct NegativeContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class negative_contig_kernel; + +template +sycl::event negative_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using NegHS = hyperparam_detail::NegativeContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = NegHS::vec_sz; + static constexpr std::uint8_t n_vecs = NegHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, NegativeOutputType, NegativeContigFunctor, + negative_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct NegativeContigFactory +{ + fnT get() + { + if constexpr (!NegativeOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = negative_contig_impl; + return fn; + } + } +}; + +template +struct NegativeTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::negative(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename NegativeOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using NegativeStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +class negative_strided_kernel; + +template +sycl::event + negative_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct NegativeStridedFactory +{ + fnT get() + { + if constexpr (!NegativeOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = negative_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::negative diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp new file mode 100644 index 000000000000..a703892a7606 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp @@ -0,0 +1,248 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of NEXTAFTER(x1, x2) +/// function. 
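+///
+/// The functor forwards to sycl::nextafter, which follows std::nextafter
+/// semantics; a host-side sketch of the scalar rule (illustrative only):
+///
+///     float up = std::nextafter(1.0f, 2.0f); // smallest float above 1.0f
+///     float dn = std::nextafter(1.0f, 0.0f); // largest float below 1.0f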
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::nextafter +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct NextafterFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::nextafter(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto res = sycl::nextafter(in1, in2); + if constexpr (std::is_same_v) { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + res); + } + } +}; + +template +using NextafterContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + NextafterFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using NextafterStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + NextafterFunctor>; + +template +struct NextafterOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct NextafterContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class nextafter_contig_kernel; + +template +sycl::event nextafter_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using NextafterHS = + hyperparam_detail::NextafterContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = NextafterHS::vec_sz; + static constexpr std::uint8_t n_vecs = NextafterHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, NextafterOutputType, NextafterContigFunctor, + nextafter_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct NextafterContigFactory +{ + fnT get() + { + if constexpr (!NextafterOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = nextafter_contig_impl; + return fn; + } + } +}; + +template +struct NextafterTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::nextafter(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename NextafterOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class nextafter_strided_kernel; + +template +sycl::event + nextafter_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, NextafterOutputType, NextafterStridedFunctor, + nextafter_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct NextafterStridedFactory +{ + fnT get() + { + if constexpr (!NextafterOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = nextafter_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::nextafter diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp new file mode 100644 index 000000000000..007f374b6386 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -0,0 +1,303 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of inequality of +/// tensor elements. 
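+///
+/// For integer operands of mixed signedness the functor compares values
+/// rather than bit patterns, unlike a bare != after the usual arithmetic
+/// conversions; a host-side sketch (illustrative only, assumes 32-bit int):
+///
+///     int a = -1;
+///     unsigned int b = 4294967295u;
+///     bool plain = (a != b); // false: -1 converts to 4294967295u
+///     // the functor reports true instead, since a < 0 can never equal b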
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::not_equal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct NotEqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_integral_v && std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && !std::is_signed_v) { + return (in1 < 0) ? true : (static_cast(in1) != in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? true : (in1 != static_cast(in2)); + } + } + } + else { + return (in1 != in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = (in1 != in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using NotEqualContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using NotEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + NotEqualFunctor>; + +template +struct NotEqualOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct NotEqualContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class not_equal_contig_kernel; + +template +sycl::event not_equal_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using NotEqHS = + hyperparam_detail::NotEqualContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = 
NotEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = NotEqHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor, + not_equal_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct NotEqualContigFactory +{ + fnT get() + { + if constexpr (!NotEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = not_equal_contig_impl; + return fn; + } + } +}; + +template +struct NotEqualTypeMapFactory +{ + /*! @brief get typeid for output type of operator()!=(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename NotEqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class not_equal_strided_kernel; + +template +sycl::event + not_equal_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, NotEqualOutputType, NotEqualStridedFunctor, + not_equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct NotEqualStridedFactory +{ + fnT get() + { + if constexpr (!NotEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = not_equal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::not_equal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp new file mode 100644 index 000000000000..fb351b6e50d2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -0,0 +1,235 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of POSITIVE(x) +/// function that returns +x. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::positive +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct PositiveFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) const { return x; } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = in; + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using PositiveContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +struct PositiveOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct PositiveContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class positive_contig_kernel; + +template +sycl::event positive_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using PosHS = hyperparam_detail::PositiveContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = 
PosHS::vec_sz; + static constexpr std::uint8_t n_vecs = PosHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, PositiveOutputType, PositiveContigFunctor, + positive_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct PositiveContigFactory +{ + fnT get() + { + if constexpr (!PositiveOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = positive_contig_impl; + return fn; + } + } +}; + +template +struct PositiveTypeMapFactory +{ + /*! @brief get typeid for output type of std::positive(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename PositiveOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using PositiveStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +class positive_strided_kernel; + +template +sycl::event + positive_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct PositiveStridedFactory +{ + fnT get() + { + if constexpr (!PositiveOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = positive_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::positive diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp new file mode 100644 index 000000000000..1c669ec894d2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -0,0 +1,599 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of POW(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::kernels::pow +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct PowFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = in1; + auto tmp2 = in2; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + return resT(0); + } + } + resT res = 1; + if (tmp1 == 1 || tmp2 == 0) { + return res; + } + while (tmp2 > 0) { + if (tmp2 & 1) { + res *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + return res; + } + else if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + using realT2 = typename argT2::value_type; + + return exprm_ns::pow(exprm_ns::complex(in1), + exprm_ns::complex(in2)); + } + else { + return sycl::pow(in1, in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = in1[i]; + auto tmp2 = in2[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; yield 0 + res[i] = 0; + continue; + } + } + resT res_tmp = 1; + if (tmp1 == 1 || tmp2 == 0) { + res[i] = res_tmp; + continue; + } + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } + return res; + } + else { + auto res = sycl::pow(in1, in2); + if constexpr (std::is_same_v< + resT, typename decltype(res)::element_type>) { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast(res); + } + } + } +}; + +template +using PowContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using PowStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct PowOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + 
td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct PowContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class pow_contig_kernel; + +template +sycl::event pow_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using PowHS = hyperparam_detail::PowContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = PowHS::vec_sz; + static constexpr std::uint8_t n_vecs = PowHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template +struct PowContigFactory +{ + fnT get() + { + if constexpr (!PowOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_contig_impl; + return fn; + } + } +}; + +template +struct PowTypeMapFactory +{ + /*! @brief get typeid for output type of std::pow(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename PowOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class pow_strided_kernel; + +template +sycl::event pow_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, PowOutputType, PowStridedFunctor, pow_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct PowStridedFactory +{ + fnT get() + { + if constexpr (!PowOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_strided_impl; + return fn; + } + } +}; + +template +struct PowInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = res; + auto tmp2 = in; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res = 0; + return; + } + } + if (tmp1 == 1) { + return; + } + if (tmp2 == 0) { + res = 1; + return; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res = res_tmp; + } + else if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using r_resT = typename resT::value_type; + using r_argT = typename argT::value_type; + + res = exprm_ns::pow(exprm_ns::complex(res), + exprm_ns::complex(in)); + } + else { + res = sycl::pow(res, in); + } + return; + } + + template + void operator()(sycl::vec &res, + 
const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = res[i]; + auto tmp2 = in[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res[i] = 0; + continue; + } + } + if (tmp1 == 1) { + continue; + } + if (tmp2 == 0) { + res[i] = 1; + continue; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } + } + else { + res = sycl::pow(res, in); + } + } +}; + +template +using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + PowInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using PowInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + PowInplaceFunctor>; + +template +class pow_inplace_contig_kernel; + +/* @brief Types supported by in-place pow */ +template +struct PowInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct PowInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x **= y */ + std::enable_if_t::value, int> get() + { + if constexpr (PowInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + pow_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using PowHS = hyperparam_detail::PowContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = PowHS::vec_sz; + static constexpr std::uint8_t n_vecs = PowHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset, + depends); +} + +template +struct PowInplaceContigFactory +{ + fnT get() + { + if constexpr (!PowInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_inplace_contig_impl; + return fn; + } + } +}; + +template +class pow_inplace_strided_kernel; + +template +sycl::event + pow_inplace_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, PowInplaceStridedFunctor, pow_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct PowInplaceStridedFactory +{ + fnT get() + { + if constexpr (!PowInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = 
pow_inplace_strided_impl<argTy, resTy>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::pow
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
new file mode 100644
index 000000000000..039da657cfd2
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -0,0 +1,239 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of PROJ(x) function.
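+///
+/// This matches std::proj semantics: any input whose real or imaginary
+/// component is infinite maps to the point at infinity, keeping only the
+/// sign of the imaginary part. A sketch of the scalar rule (illustrative
+/// only, not exercised by the kernel plumbing):
+///
+///     std::complex<float> z1{-INFINITY, 3.0f}; // projects to {+INFINITY, +0.0f}
+///     std::complex<float> z2{2.0f, -INFINITY}; // projects to {+INFINITY, -0.0f}
+///     std::complex<float> z3{2.0f, 3.0f};      // finite: returned unchanged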
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::proj +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct ProjFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::false_type; + + resT operator()(const argT &in) const + { + using realT = typename argT::value_type; + const realT x = std::real(in); + const realT y = std::imag(in); + + if (std::isinf(x)) { + return value_at_infinity(y); + } + else if (std::isinf(y)) { + return value_at_infinity(y); + } + else { + return in; + } + } + +private: + template + std::complex value_at_infinity(const T &y) const + { + const T res_im = sycl::copysign(T(0), y); + return std::complex{std::numeric_limits::infinity(), res_im}; + } +}; + +template +using ProjContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ProjStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct ProjOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ProjContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class proj_contig_kernel; + +template +sycl::event proj_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using ProjHS = hyperparam_detail::ProjContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = ProjHS::vec_sz; + static constexpr std::uint8_t n_vecs = ProjHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct ProjContigFactory +{ + fnT get() + { + if constexpr (!ProjOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (std::is_same_v>) { + fnT fn = proj_contig_impl; + return fn; + } + else { + fnT fn = proj_contig_impl; + return fn; + } + } + } +}; + +template +struct ProjTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::proj(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename ProjOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class proj_strided_kernel; + +template +sycl::event + proj_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, ProjOutputType, ProjStridedFunctor, proj_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ProjStridedFactory +{ + fnT get() + { + if constexpr (!ProjOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = proj_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::proj diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/real.hpp new file mode 100644 index 000000000000..d21a9e6baa7d --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -0,0 +1,231 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of REAL(x) function. 
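// Illustration (not part of the patch): RealFunctor below is the identity for
// real-valued inputs and extracts the real component of complex inputs; the
// type map sends std::complex<float> to float and std::complex<double> to
// double. demo_real is a hypothetical host-side equivalent.
#include <complex>
#include <type_traits>

template <typename T>
auto demo_real(const T &in)
{
    if constexpr (std::is_same_v<T, std::complex<float>> ||
                  std::is_same_v<T, std::complex<double>>) {
        return std::real(in); // complex<T> -> T
    }
    else {
        return in; // bool, integer and floating inputs pass through
    }
}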
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::real +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::is_complex_v; + +template +struct RealFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex_v) { + return std::real(in); + } + else { + static_assert(std::is_same_v); + return in; + } + } +}; + +template +using RealContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RealStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct RealOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct RealContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class real_contig_kernel; + +template +sycl::event real_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using RealHS = hyperparam_detail::RealContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RealHS::vec_sz; + static constexpr std::uint8_t n_vecs = RealHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct RealContigFactory +{ + fnT get() + { + if constexpr (!RealOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = real_contig_impl; + return fn; + } + } +}; + +template +struct RealTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::real(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename RealOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class real_strided_kernel; + +template +sycl::event + real_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, RealOutputType, RealStridedFunctor, real_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct RealStridedFactory +{ + fnT get() + { + if constexpr (!RealOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = real_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::real diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp new file mode 100644 index 000000000000..f26f4043c9ab --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -0,0 +1,229 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of RECIPROCAL(x) +/// function. 
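// Illustration (not part of the patch): scalar behavior of the
// ReciprocalFunctor below. Its output-type map admits only floating-point and
// complex inputs, so 1/x is never truncated to an integer. demo_reciprocal is
// a hypothetical host-side equivalent.
#include <complex>

template <typename T>
T demo_reciprocal(const T &in)
{
    return T(1) / in; // for complex T this is the full complex reciprocal
}
// demo_reciprocal(0.25) == 4.0;
// demo_reciprocal(std::complex<double>(0.0, 1.0)) == {0.0, -1.0}.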
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::reciprocal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct ReciprocalFunctor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + + using realT = typename argT::value_type; + + return realT(1) / exprm_ns::complex(in); + } + else { + return argT(1) / in; + } + } +}; + +template +using ReciprocalContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ReciprocalStridedFunctor = + elementwise_common::UnaryStridedFunctor>; + +template +struct ReciprocalOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ReciprocalContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class reciprocal_contig_kernel; + +template +sycl::event reciprocal_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using RecipHS = hyperparam_detail::ReciprocalContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RecipHS::vec_sz; + static constexpr std::uint8_t n_vecs = RecipHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ReciprocalOutputType, ReciprocalContigFunctor, + reciprocal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct ReciprocalContigFactory +{ + fnT get() + { + if constexpr (!ReciprocalOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = reciprocal_contig_impl; + return fn; + } + } +}; + +template +struct ReciprocalTypeMapFactory +{ + /*! 
@brief get typeid for output type of 1 / x */ + std::enable_if_t::value, int> get() + { + using rT = typename ReciprocalOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class reciprocal_strided_kernel; + +template +sycl::event + reciprocal_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ReciprocalStridedFactory +{ + fnT get() + { + if constexpr (!ReciprocalOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = reciprocal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::reciprocal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp new file mode 100644 index 000000000000..65cd97dbe56d --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -0,0 +1,572 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of REMAINDER(x1, x2) +/// function that computes the Python modulus operator, which is specifically +/// designed as the complement to floor_divide(x1, x2). 
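// Illustration (not part of the patch): the Python-style modulus rule that
// RemainderFunctor below applies to real operands -- take the native
// remainder, then shift by the divisor when the signs disagree, so the result
// carries the sign of the divisor. demo_remainder is a hypothetical host-side
// equivalent of the scalar floating-point branch.
#include <cmath>

double demo_remainder(double x1, double x2)
{
    double rem = std::fmod(x1, x2);      // native remainder, sign of x1
    if (rem != 0.0) {
        if ((x2 < 0.0) != (rem < 0.0)) { // logical XOR of the signs
            rem += x2;                   // fold into the divisor's sign range
        }
    }
    else {
        rem = std::copysign(0.0, x2);    // signed zero follows the divisor
    }
    return rem;
}
// demo_remainder(-7.0, 3.0) == 2.0 and demo_remainder(7.0, -3.0) == -2.0,
// matching Python's % operator, where std::fmod alone gives -1.0 and 1.0.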
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::kernels::remainder +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct RemainderFunctor +{ + static_assert(std::is_same_v); + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in2 == argT2(0)) { + return resT(0); + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto out = (in1 % in2); + if (out != 0 && l_xor(in1 < 0, in2 < 0)) { + out += in2; + } + return out; + } + else { + return (in1 % in2); + } + } + else { + auto rem = sycl::fmod(in1, in2); + if (rem) { + if (l_xor(in2 < 0, rem < 0)) { + rem += in2; + } + } + else { + rem = sycl::copysign(resT(0), in2); + } + return rem; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + sycl::vec rem; +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (in2[i] == argT2(0)) { + rem[i] = resT(0); + } + else { + rem[i] = in1[i] % in2[i]; + if constexpr (std::is_signed_v || + std::is_signed_v) { + if (rem[i] != 0 && l_xor(in1[i] < 0, in2[i] < 0)) { + rem[i] += in2[i]; + } + } + } + } + return rem; + } + else { + auto rem = sycl::fmod(in1, in2); + using remT = typename decltype(rem)::element_type; +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (rem[i]) { + if (l_xor(in2[i] < 0, rem[i] < 0)) { + rem[i] += in2[i]; + } + } + else { + rem[i] = sycl::copysign(remT(0), in2[i]); + } + } + if constexpr (std::is_same_v) { + return rem; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast(rem); + } + } + } + +private: + bool l_xor(bool b1, bool b2) const { return (b1 != b2); } +}; + +template +using RemainderContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + RemainderFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RemainderStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + RemainderFunctor>; + +template +struct RemainderOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct RemainderContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + 
constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class remainder_contig_kernel; + +template +sycl::event remainder_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using RemHS = + hyperparam_detail::RemainderContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RemHS::vec_sz; + static constexpr std::uint8_t n_vecs = RemHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, RemainderOutputType, RemainderContigFunctor, + remainder_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct RemainderContigFactory +{ + fnT get() + { + if constexpr (!RemainderOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_contig_impl; + return fn; + } + } +}; + +template +struct RemainderTypeMapFactory +{ + /*! @brief get typeid for output type of remainder(T x, T y) */ + std::enable_if_t::value, int> get() + { + using rT = typename RemainderOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class remainder_strided_kernel; + +template +sycl::event + remainder_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, RemainderOutputType, RemainderStridedFunctor, + remainder_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct RemainderStridedFactory +{ + fnT get() + { + if constexpr (!RemainderOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_strided_impl; + return fn; + } + } +}; + +template +struct RemainderInplaceFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + // functor is only well-defined when argT and resT are the same + static_assert(std::is_same_v); + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in == argT(0)) { + res = 0; + return; + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto tmp = res; + res %= in; + if (res != resT(0) && l_xor(tmp < 0, in < 0)) { + res += in; + } + } + else { + res %= in; + } + } + else { + res = sycl::fmod(res, in); + if (res) { + if (l_xor(in < 0, res < 0)) { + res += in; + } + } + else { + res = sycl::copysign(resT(0), in); + } + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (in[i] == argT(0)) { + res[i] = 0; + } + else { + auto rem = res[i] % in[i]; + if constexpr (std::is_signed_v || + std::is_signed_v) { + if (rem != 0 && l_xor(res[i] < 0, in[i] < 0)) { + rem += in[i]; + } + } + res[i] = rem; + } + } + } + else { + res = sycl::fmod(res, in); +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (res[i]) { + if (l_xor(in[i] < 0, 
res[i] < 0)) { + res[i] += in[i]; + } + } + else { + res[i] = sycl::copysign(resT(0), in[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const { return (b1 != b2); } +}; + +template +using RemainderInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + RemainderInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RemainderInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + RemainderInplaceFunctor>; + +template +class remainder_inplace_contig_kernel; + +/* @brief Types supported by in-place remainder */ +template +struct RemainderInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct RemainderInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x %= y */ + std::enable_if_t::value, int> get() + { + if constexpr (RemainderInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + remainder_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using RemHS = + hyperparam_detail::RemainderContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RemHS::vec_sz; + static constexpr std::uint8_t n_vecs = RemHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, RemainderInplaceContigFunctor, + remainder_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct RemainderInplaceContigFactory +{ + fnT get() + { + if constexpr (!RemainderInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_contig_impl; + return fn; + } + } +}; + +template +class remainder_inplace_strided_kernel; + +template +sycl::event remainder_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, RemainderInplaceStridedFunctor, + remainder_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct RemainderInplaceStridedFactory +{ + fnT get() + { + if constexpr (!RemainderInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::remainder diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp new file mode 100644 index 000000000000..b20166a4d505 --- /dev/null +++ 
b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -0,0 +1,241 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ROUND(x) function. 
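// Illustration (not part of the patch): RoundFunctor below rounds through
// sycl::rint, i.e. round-half-to-even ("banker's rounding"), not
// round-half-away-from-zero; integral inputs pass through, and the real and
// imaginary parts of complex inputs each get the same rule. demo_round uses
// the host equivalent std::rint under the default round-to-nearest-even mode.
#include <cmath>

double demo_round(double x) { return std::rint(x); }
// demo_round(0.5) == 0.0, demo_round(1.5) == 2.0, demo_round(2.5) == 2.0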
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::round +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct RoundFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + + if constexpr (std::is_integral_v) { + return in; + } + else if constexpr (is_complex::value) { + using realT = typename argT::value_type; + return resT{round_func(std::real(in)), + round_func(std::imag(in))}; + } + else { + return round_func(in); + } + } + +private: + template + T round_func(const T &input) const + { + return sycl::rint(input); + } +}; + +template +using RoundContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RoundStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct RoundOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct RoundContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class round_contig_kernel; + +template +sycl::event round_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using RoundHS = hyperparam_detail::RoundContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RoundHS::vec_sz; + static constexpr std::uint8_t n_vecs = RoundHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct RoundContigFactory +{ + fnT get() + { + if constexpr (!RoundOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = round_contig_impl; + return fn; + } + } +}; + +template +struct RoundTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::rint(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RoundOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class round_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    round_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, RoundOutputType, RoundStridedFunctor, round_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct RoundStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RoundOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = round_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::round
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
new file mode 100644
index 000000000000..aa4f1113d839
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -0,0 +1,206 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of RSQRT(x)
+/// function that computes the reciprocal square root.
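// Illustration (not part of the patch): RsqrtFunctor below evaluates the
// reciprocal square root with a single sycl::rsqrt(x) call, which lets a
// device use a native rsqrt instruction where one exists. A host-side
// reference for comparison (demo_rsqrt is hypothetical):
#include <cmath>

double demo_rsqrt(double x) { return 1.0 / std::sqrt(x); }
// demo_rsqrt(4.0) == 0.5; demo_rsqrt(0.0) == +inf, consistent with 1/sqrt(x).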
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::rsqrt +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct RsqrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const { return sycl::rsqrt(in); } +}; + +template +using RsqrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RsqrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct RsqrtOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct RsqrtContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class rsqrt_contig_kernel; + +template +sycl::event rsqrt_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using RsqrtHS = hyperparam_detail::RsqrtContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct RsqrtContigFactory +{ + fnT get() + { + if constexpr (!RsqrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_contig_impl; + return fn; + } + } +}; + +template +struct RsqrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::rsqrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename RsqrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class rsqrt_strided_kernel; + +template +sycl::event + rsqrt_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct RsqrtStridedFactory +{ + fnT get() + { + if constexpr (!RsqrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::rsqrt diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp new file mode 100644 index 000000000000..ceb3d1320f9c --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -0,0 +1,258 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SIGN(x) function. 
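// Illustration (not part of the patch): the branchless comparison trick that
// SignFunctor below uses for signed real inputs; NaN propagates, and for
// complex z the kernel instead returns z / |z| (or 0 when z == 0), a
// unit-magnitude value carrying the phase of z. demo_sign is hypothetical.
#include <cmath>

double demo_sign(double v)
{
    if (std::isnan(v)) {
        return v;                         // NaN in, NaN out
    }
    return double((0.0 < v) - (v < 0.0)); // -1.0, 0.0 or +1.0
}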
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cabs_impl.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::sign +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct SignFunctor +{ + static_assert(std::is_same_v); + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + using supports_sg_loadstore = std::false_type; + + resT operator()(const argT &in) const + { + if constexpr (std::is_integral_v) { + if constexpr (std::is_unsigned_v) { + return resT(0 < in); + } + else { + return sign_impl(in); + } + } + else { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + if (in == argT(0)) { + return resT(0); + } + else { + auto z = exprm_ns::complex(in); + return (z / detail::cabs(in)); + } + } + else { + if (std::isnan(in)) { + return std::numeric_limits::quiet_NaN(); + } + else { + return sign_impl(in); + } + } + } + } + +private: + template + T sign_impl(const T &v) const + { + return (T(0) < v) - (v < T(0)); + } +}; + +template +using SignContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +struct SignOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SignContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class sign_contig_kernel; + +template +sycl::event sign_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SignHS = hyperparam_detail::SignContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SignHS::vec_sz; + static constexpr std::uint8_t n_vecs = SignHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SignContigFactory +{ + fnT get() + { + if constexpr (!SignOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sign_contig_impl; + return fn; + } + } +}; + +template +struct SignTypeMapFactory +{ + /*! 
@brief get typeid for output type of sign(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SignOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using SignStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +class sign_strided_kernel; + +template +sycl::event + sign_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SignOutputType, SignStridedFunctor, sign_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SignStridedFactory +{ + fnT get() + { + if constexpr (!SignOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sign_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::sign diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp new file mode 100644 index 000000000000..65e9e5a202a9 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -0,0 +1,220 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SIGNBIT(x) +/// function that tests whether the sign bit of the tensor element is set. 
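// Illustration (not part of the patch): SignbitFunctor below wraps
// std::signbit, which reads the sign bit directly, so negative zero (and a
// negative NaN) reports true even though -0.0 compares equal to 0.0. This is
// why the kernel cannot simply be written as x < 0. demo_signbit is
// hypothetical.
#include <cmath>

bool demo_signbit(double x) { return std::signbit(x); }
// demo_signbit(-0.0) == true, while (-0.0 < 0.0) == false;
// demo_signbit(3.0) == false.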
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::signbit +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct SignbitFunctor +{ + static_assert(std::is_same_v); + + using is_constant = std::false_type; + static constexpr resT constant_value = false; + using supports_vec = std::true_type; + using supports_sg_loadstore = std::true_type; + + resT operator()(const argT &in) const { return std::signbit(in); } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::signbit(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using SignbitContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SignbitStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SignbitOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SignbitContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class signbit_contig_kernel; + +template +sycl::event signbit_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SignbitHS = hyperparam_detail::SignbitContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SignbitHS::vec_sz; + static constexpr std::uint8_t n_vecs = SignbitHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SignbitContigFactory +{ + fnT get() + { + if constexpr (!SignbitOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = signbit_contig_impl; + return fn; + } + } +}; + +template +struct SignbitTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::signbit(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SignbitOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class signbit_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    signbit_strided_impl(sycl::queue &exec_q,
+                         std::size_t nelems,
+                         int nd,
+                         const ssize_t *shape_and_strides,
+                         const char *arg_p,
+                         ssize_t arg_offset,
+                         char *res_p,
+                         ssize_t res_offset,
+                         const std::vector<sycl::event> &depends,
+                         const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, SignbitOutputType, SignbitStridedFunctor,
+        signbit_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct SignbitStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SignbitOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = signbit_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::signbit
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
new file mode 100644
index 000000000000..d1e3caa9effe
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -0,0 +1,333 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SIN(x) function.
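// Illustration (not part of the patch): the reduction SinFunctor below uses
// for complex special values. With w = sinh(i * z), the identity
// sin(z) = -i * sinh(i * z) means the kernel can analyze w and return
// {imag(w), -real(w)}. demo_sin_via_sinh is a hypothetical host-side check.
#include <complex>

std::complex<double> demo_sin_via_sinh(std::complex<double> z)
{
    const std::complex<double> i{0.0, 1.0};
    const std::complex<double> w = std::sinh(i * z);
    return {w.imag(), -w.real()}; // equals std::sin(z)
}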
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::sin +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct SinFunctor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + realT const &in_re = std::real(in); + realT const &in_im = std::imag(in); + + const bool in_re_finite = std::isfinite(in_re); + const bool in_im_finite = std::isfinite(in_im); + /* + * Handle the nearly-non-exceptional cases where + * real and imaginary parts of input are finite. + */ + if (in_re_finite && in_im_finite) { + resT res = + exprm_ns::sin(exprm_ns::complex(in)); // sin(in); + if (in_re == realT(0)) { + res.real(sycl::copysign(realT(0), in_re)); + } + return res; + } + + /* + * since sin(in) = -I * sinh(I * in), for special cases, + * we calculate real and imaginary parts of z = sinh(I * in) and + * then return { imag(z) , -real(z) } which is sin(in). + */ + const realT x = -in_im; + const realT y = in_re; + const bool xfinite = in_im_finite; + const bool yfinite = in_re_finite; + /* + * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. + * + * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN). + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == realT(0) && !yfinite) { + const realT sinh_im = q_nan; + const realT sinh_re = sycl::copysign(realT(0), x * sinh_im); + return resT{sinh_im, -sinh_re}; + } + + /* + * sinh(+-Inf +- I 0) = +-Inf + I +-0. + * + * sinh(NaN +- I 0) = d(NaN) + I +-0. + */ + if (y == realT(0) && !xfinite) { + if (std::isnan(x)) { + const realT sinh_re = x; + const realT sinh_im = y; + return resT{sinh_im, -sinh_re}; + } + const realT sinh_re = x; + const realT sinh_im = sycl::copysign(realT(0), y); + return resT{sinh_im, -sinh_re}; + } + + /* + * sinh(x +- I Inf) = dNaN + I dNaN. + * + * sinh(x + I NaN) = d(NaN) + I d(NaN). + */ + if (xfinite && !yfinite) { + const realT sinh_re = q_nan; + const realT sinh_im = x * sinh_re; + return resT{sinh_im, -sinh_re}; + } + + /* + * sinh(+-Inf + I NaN) = +-Inf + I d(NaN). + * The sign of Inf in the result is unspecified. Choice = normally + * the same as d(NaN). + * + * sinh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. + * Choice = always - here for sinh to have positive result for + * imaginary part of sin. 
+ * + * sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y) + */ + if (std::isinf(x)) { + if (!yfinite) { + const realT sinh_re = -x * x; + const realT sinh_im = x * (y - y); + return resT{sinh_im, -sinh_re}; + } + const realT sinh_re = x * sycl::cos(y); + const realT sinh_im = + std::numeric_limits::infinity() * sycl::sin(y); + return resT{sinh_im, -sinh_re}; + } + + /* + * sinh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * sinh(NaN +- I Inf) = d(NaN) + I d(NaN). + * + * sinh(NaN + I y) = d(NaN) + I d(NaN). + */ + const realT y_m_y = (y - y); + const realT sinh_re = (x * x) * y_m_y; + const realT sinh_im = (x + x) * y_m_y; + return resT{sinh_im, -sinh_re}; + } + else { + static_assert(std::is_same_v); + if (in == 0) { + return in; + } + return sycl::sin(in); + } + } +}; + +template +using SinContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SinStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SinOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SinContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class sin_contig_kernel; + +template +sycl::event sin_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SinHS = hyperparam_detail::SinContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SinHS::vec_sz; + static constexpr std::uint8_t n_vecs = SinHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SinContigFactory +{ + fnT get() + { + if constexpr (!SinOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sin_contig_impl; + return fn; + } + } +}; + +template +struct SinTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::sin(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SinOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class sin_strided_kernel; + +template +sycl::event sin_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SinOutputType, SinStridedFunctor, sin_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SinStridedFactory +{ + fnT get() + { + if constexpr (!SinOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sin_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::sin diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp new file mode 100644 index 000000000000..f81a2730fd17 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -0,0 +1,302 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SINH(x) function. 
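The complex branch of SinFunctor above is organized around the identity sin(z) = -i*sinh(i*z): for the non-finite cases it computes z' = sinh(i*z) componentwise and returns {imag(z'), -real(z')}. A small host-side check of that identity using std::complex (illustrative only, not part of the kernels):

// Host-side sanity check of sin(z) == -i * sinh(i * z), the identity the
// complex SinFunctor special cases are derived from. Illustrative only.
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    const std::complex<double> z{0.3, -1.7};
    const std::complex<double> i{0.0, 1.0};

    const std::complex<double> lhs = std::sin(z);
    const std::complex<double> rhs = -i * std::sinh(i * z);

    assert(std::abs(lhs - rhs) < 1e-12);
    return 0;
}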
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::sinh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct SinhFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + const realT x = std::real(in); + const realT y = std::imag(in); + + const bool xfinite = std::isfinite(x); + const bool yfinite = std::isfinite(y); + + /* + * Handle the nearly-non-exceptional cases where + * real and imaginary parts of input are finite. + */ + if (xfinite && yfinite) { + return exprm_ns::sinh(exprm_ns::complex(in)); + } + /* + * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. + * + * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN). + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == realT(0) && !yfinite) { + const realT res_re = sycl::copysign(realT(0), x * (y - y)); + return resT{res_re, y - y}; + } + + /* + * sinh(+-Inf +- I 0) = +-Inf + I +-0. + * + * sinh(NaN +- I 0) = d(NaN) + I +-0. + */ + if (y == realT(0) && !xfinite) { + if (std::isnan(x)) { + return resT{x, y}; + } + const realT res_im = sycl::copysign(realT(0), y); + return resT{x, res_im}; + } + + /* + * sinh(x +- I Inf) = dNaN + I dNaN. + * + * sinh(x + I NaN) = d(NaN) + I d(NaN). + */ + if (xfinite && !yfinite) { + return resT{y - y, x * (y - y)}; + } + + /* + * sinh(+-Inf + I NaN) = +-Inf + I d(NaN). + * The sign of Inf in the result is unspecified. Choice = normally + * the same as d(NaN). + * + * sinh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * + * sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y) + */ + if (!xfinite && !std::isnan(x)) { + if (!yfinite) { + return resT{x * x, x * (y - y)}; + } + return resT{x * sycl::cos(y), + std::numeric_limits::infinity() * + sycl::sin(y)}; + } + + /* + * sinh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * sinh(NaN +- I Inf) = d(NaN) + I d(NaN). + * + * sinh(NaN + I y) = d(NaN) + I d(NaN). 
+ */ + return resT{(x * x) * (y - y), (x + x) * (y - y)}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::sinh(in); + } + } +}; + +template +using SinhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SinhStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SinhOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SinhContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class sinh_contig_kernel; + +template +sycl::event sinh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SinhHS = hyperparam_detail::SinhContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SinhHS::vec_sz; + static constexpr std::uint8_t n_vecs = SinhHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SinhContigFactory +{ + fnT get() + { + if constexpr (!SinhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sinh_contig_impl; + return fn; + } + } +}; + +template +struct SinhTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::sinh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SinhOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class sinh_strided_kernel; + +template +sycl::event + sinh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SinhOutputType, SinhStridedFunctor, sinh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SinhStridedFactory +{ + fnT get() + { + if constexpr (!SinhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sinh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::sinh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp new file mode 100644 index 000000000000..08b3b092d1ca --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -0,0 +1,224 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SQRT(x) +/// function that computes a square root. 
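Each *OutputType struct in these headers, including SqrtOutputType below, resolves its result type the same way: std::disjunction scans a list of td_ns::TypeMapResultEntry candidates, inherits from the first one whose argument type matches, and falls through to DefaultResultEntry<void> for unsupported types (which is what is_defined tests). A stripped-down model of that lookup, using hypothetical mini-traits for exposition rather than the td_ns code itself:

// Minimal model of the TypeMapResultEntry / DefaultResultEntry lookup used
// by the *OutputType structs. Hypothetical reimplementation for exposition.
#include <complex>
#include <type_traits>

template <typename T, typename ArgT, typename ResT>
struct TypeMapEntry : std::is_same<T, ArgT>
{
    using result_type = ResT;
};

template <typename ResT> struct DefaultEntry : std::true_type
{
    using result_type = ResT;
};

// std::disjunction derives from the first entry whose ::value is true,
// so result_type comes from the first matching TypeMapEntry.
template <typename T>
using sqrt_result_t = typename std::disjunction<
    TypeMapEntry<T, float, float>,
    TypeMapEntry<T, double, double>,
    TypeMapEntry<T, std::complex<float>, std::complex<float>>,
    DefaultEntry<void>>::result_type;

static_assert(std::is_same_v<sqrt_result_t<float>, float>);
static_assert(std::is_same_v<sqrt_result_t<int>, void>); // unsupported -> void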
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::sqrt +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct SqrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + return exprm_ns::sqrt(exprm_ns::complex(in)); + } + else { + return sycl::sqrt(in); + } + } +}; + +template +using SqrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SqrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SqrtOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SqrtContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class sqrt_contig_kernel; + +template +sycl::event sqrt_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SqrtHS = hyperparam_detail::SqrtContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SqrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = SqrtHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SqrtContigFactory +{ + fnT get() + { + if constexpr (!SqrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sqrt_contig_impl; + return fn; + } + } +}; + +template +struct SqrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::sqrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SqrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class sqrt_strided_kernel; + +template +sycl::event + sqrt_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SqrtOutputType, SqrtStridedFunctor, sqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SqrtStridedFactory +{ + fnT get() + { + if constexpr (!SqrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sqrt_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::sqrt diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/square.hpp new file mode 100644 index 000000000000..de3007acfbea --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -0,0 +1,251 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
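The strided entry points such as sqrt_strided_impl above receive the iteration space as one packed array: by the dpctl convention these kernels follow, shape_and_strides holds 3*nd entries, i.e. the shape, then the input strides, then the output strides (all in elements), copied into USM device memory by the caller. A host-side sketch of that packing; the helper name is illustrative and the layout is the assumed convention:

// Host-side sketch of the assumed shape_and_strides layout for a unary
// strided kernel: nd shape entries, then nd input strides, then nd output
// strides, all in elements. Helper name and layout claim are illustrative.
#include <cstddef>
#include <vector>

std::vector<std::ptrdiff_t>
pack_shape_strides(const std::vector<std::ptrdiff_t> &shape,
                   const std::vector<std::ptrdiff_t> &arg_strides,
                   const std::vector<std::ptrdiff_t> &res_strides)
{
    std::vector<std::ptrdiff_t> packed;
    packed.reserve(3 * shape.size());
    packed.insert(packed.end(), shape.begin(), shape.end());
    packed.insert(packed.end(), arg_strides.begin(), arg_strides.end());
    packed.insert(packed.end(), res_strides.begin(), res_strides.end());
    // The real callers copy this host buffer into USM device memory and pass
    // the device pointer as `shape_and_strides`, with `nd = shape.size()`.
    return packed;
}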
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SQUARE(x) +/// +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::square +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct SquareFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + auto z = exprm_ns::complex(in); + + return z * z; + } + else { + return in * in; + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = in * in; + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using SquareContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SquareStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SquareOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SquareContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class square_contig_kernel; + +template +sycl::event square_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SquareHS = hyperparam_detail::SquareContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SquareHS::vec_sz; + static constexpr std::uint8_t n_vecs = SquareHS::n_vecs; + + return 
elementwise_common::unary_contig_impl< + argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SquareContigFactory +{ + fnT get() + { + if constexpr (!SquareOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = square_contig_impl; + return fn; + } + } +}; + +template +struct SquareTypeMapFactory +{ + /*! @brief get typeid for output type of x * x */ + std::enable_if_t::value, int> get() + { + using rT = typename SquareOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class square_strided_kernel; + +template +sycl::event + square_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SquareOutputType, SquareStridedFunctor, square_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SquareStridedFactory +{ + fnT get() + { + if constexpr (!SquareOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = square_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::square diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp new file mode 100644 index 000000000000..431596594ad3 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -0,0 +1,640 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
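Unlike the transcendental kernels, SquareFunctor above advertises supports_vec for non-complex types and adds a sycl::vec overload, so the contiguous kernel can square whole sub-group vectors at once; when the element type deduced from in * in differs from resT, vec_cast converts the result back. A plain-C++ model of that cast-only-when-needed structure, with std::array standing in for sycl::vec (illustrative sketch only):

// Plain-C++ model of the sycl::vec overload in SquareFunctor above:
// square element-wise, then cast only when the deduced element type
// differs from resT. vec_cast_model mirrors dpctl's vec_cast.
#include <array>
#include <cstddef>
#include <type_traits>

template <typename To, typename From, std::size_t N>
std::array<To, N> vec_cast_model(const std::array<From, N> &v)
{
    std::array<To, N> r{};
    for (std::size_t i = 0; i < N; ++i) {
        r[i] = static_cast<To>(v[i]);
    }
    return r;
}

template <typename resT, typename argT, std::size_t N>
auto square_vec_model(const std::array<argT, N> &in)
{
    std::array<argT, N> res{};
    for (std::size_t i = 0; i < N; ++i) {
        res[i] = in[i] * in[i]; // element-wise in * in
    }
    if constexpr (std::is_same_v<resT, argT>) {
        return res; // no conversion needed, return as-is
    }
    else {
        return vec_cast_model<resT, argT, N>(res); // convert to resT
    }
}

// Usage: square_vec_model<float>(std::array<int, 4>{1, 2, 3, 4}) yields
// std::array<float, 4>{1.f, 4.f, 9.f, 16.f}.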
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SUBTRACT(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::kernels::subtract +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct SubtractFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return in1 - in2; + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = in1 - in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using SubtractContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + SubtractFunctor>; + +template +struct SubtractOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct SubtractContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class subtract_contig_kernel; + +template +sycl::event subtract_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using SubHS = + hyperparam_detail::SubtractContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SubHS::vec_sz; + static constexpr std::uint8_t n_vecs = SubHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, SubtractOutputType, SubtractContigFunctor, + subtract_contig_kernel, 
vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct SubtractContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2> struct SubtractTypeMapFactory
+{
+    /*! @brief get typeid for output type of subtract(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SubtractOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+class subtract_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    subtract_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, SubtractOutputType, SubtractStridedFunctor,
+        subtract_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct SubtractStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+using SubtractContigMatrixContigRowBroadcastingFunctor =
+    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+using SubtractContigRowContigMatrixBroadcastingFunctor =
+    elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2, typename T3>
+class subtract_matrix_row_broadcast_sg_krn;
+
+template <typename T1, typename T2, typename T3>
+class subtract_row_matrix_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event subtract_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig.
matrix, + // res[i,j] = mat[i,j] - vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, SubtractContigMatrixContigRowBroadcastingFunctor, + subtract_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, + res_p, res_offset, depends); +} + +template +struct SubtractContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (!SubtractOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename SubtractOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + subtract_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event subtract_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(vec[j], mat[i,j]) + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< + argT1, argT2, resT, SubtractContigRowContigMatrixBroadcastingFunctor, + subtract_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, vec_p, + vec_offset, mat_p, mat_offset, + res_p, res_offset, depends); +} + +template +struct SubtractContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!SubtractOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename SubtractOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + subtract_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +template +struct SubtractInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) { res -= in; } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res -= in; + } +}; + +template +using SubtractInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + SubtractInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SubtractInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + SubtractInplaceFunctor>; + +template +class subtract_inplace_contig_kernel; + +/* @brief Types supported by in-place subtraction */ +template +struct SubtractInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SubtractInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x -= y */ + std::enable_if_t::value, int> get() + { + if constexpr (SubtractInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + subtract_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using SubHS = + hyperparam_detail::SubtractContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SubHS::vec_sz; + static constexpr std::uint8_t n_vecs = SubHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, SubtractInplaceContigFunctor, + subtract_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct SubtractInplaceContigFactory +{ + fnT get() + { + if constexpr (!SubtractInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_inplace_contig_impl; + return fn; + } + } +}; + +template +class subtract_inplace_strided_kernel; + +template +sycl::event subtract_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, SubtractInplaceStridedFunctor, + subtract_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct SubtractInplaceStridedFactory +{ + fnT get() + { + if constexpr (!SubtractInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_inplace_strided_impl; + return fn; + } + } +}; + +template +class subtract_inplace_row_matrix_broadcast_sg_krn; + +template +using SubtractInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + SubtractInplaceFunctor>; + +template +sycl::event subtract_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, SubtractInplaceRowMatrixBroadcastingFunctor, + subtract_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct SubtractInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!SubtractInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_inplace_row_matrix_broadcast_impl; + return fn; + } + } + 
}
+};
+
+} // namespace dpctl::tensor::kernels::subtract
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
new file mode 100644
index 000000000000..5cadec6ce2a4
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
@@ -0,0 +1,44 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines the SYCL_EXT_ONEAPI_COMPLEX macro and indirectly
+/// includes the experimental oneAPI SYCL complex extension header file.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#define SYCL_EXT_ONEAPI_COMPLEX
+#if __has_include(<sycl/ext/oneapi/experimental/complex/complex.hpp>)
+#include <sycl/ext/oneapi/experimental/complex/complex.hpp>
+#else
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#endif
+
+namespace exprm_ns = sycl::ext::oneapi::experimental;
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
new file mode 100644
index 000000000000..2db2a6b5fbf8
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -0,0 +1,276 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of TAN(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::tan +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct TanFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + /* + * since tan(in) = -I * tanh(I * in), for special cases, + * we calculate real and imaginary parts of z = tanh(I * in) and + * return { imag(z) , -real(z) } which is tan(in). + */ + const realT x = -std::imag(in); + const realT y = std::real(in); + /* + * tanh(NaN + i 0) = NaN + i 0 + * + * tanh(NaN + i y) = NaN + i NaN for y != 0 + * + * The imaginary part has the sign of x*sin(2*y), but there's no + * special effort to get this right. + * + * tanh(+-Inf +- i Inf) = +-1 +- 0 + * + * tanh(+-Inf + i y) = +-1 + 0 sin(2y) for y finite + * + * The imaginary part of the sign is unspecified. This special + * case is only needed to avoid a spurious invalid exception when + * y is infinite. + */ + if (!std::isfinite(x)) { + if (std::isnan(x)) { + const realT tanh_re = x; + const realT tanh_im = (y == realT(0) ? y : x * y); + return resT{tanh_im, -tanh_re}; + } + const realT tanh_re = sycl::copysign(realT(1), x); + const realT tanh_im = sycl::copysign( + realT(0), std::isinf(y) ? 
y : sycl::sin(y) * sycl::cos(y)); + return resT{tanh_im, -tanh_re}; + } + /* + * tanh(x + i NAN) = NaN + i NaN for non-zero x + * tanh(x +- i Inf) = NaN + i NaN for non-zero x + * tanh(0 + i NAN) = 0 + i NaN + * tanh(0 +- i Inf) = 0 + i NaN + */ + if (!std::isfinite(y)) { + if (x == realT(0)) { + return resT{q_nan, x}; + } + return resT{q_nan, q_nan}; + } + /* ordinary cases */ + return exprm_ns::tan(exprm_ns::complex(in)); // tan(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::tan(in); + } + } +}; + +template +using TanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TanStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct TanOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct TanContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class tan_contig_kernel; + +template +sycl::event tan_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using TanHS = hyperparam_detail::TanContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = TanHS::vec_sz; + static constexpr std::uint8_t n_vecs = TanHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct TanContigFactory +{ + fnT get() + { + if constexpr (!TanOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = tan_contig_impl; + return fn; + } + } +}; + +template +struct TanTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::tan(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename TanOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class tan_strided_kernel; + +template +sycl::event tan_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, TanOutputType, TanStridedFunctor, tan_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TanStridedFactory +{ + fnT get() + { + if constexpr (!TanOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = tan_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::tan diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp new file mode 100644 index 000000000000..dde16128fb1a --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -0,0 +1,270 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of TANH(x) function. 
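TanFunctor above handles non-finite inputs through the rotation tan(z) = -i*tanh(i*z): it forms x = -imag(z), y = real(z), walks the C99 Annex G tanh special cases in those coordinates, and only calls exprm_ns::tan on the ordinary path. Below is a host-side spot-check of two of the encoded special values; it assumes an Annex-G-conforming std::tanh as the reference, so treat it as illustrative rather than portable test code.

// Host-side spot-check of two tanh special values from the table that
// TanFunctor encodes (via the tan(z) = -i*tanh(i*z) rotation), using
// std::tanh as reference. Assumes a C99 Annex G conforming libm.
#include <cassert>
#include <cmath>
#include <complex>
#include <limits>

int main()
{
    const double inf = std::numeric_limits<double>::infinity();
    const double nan = std::numeric_limits<double>::quiet_NaN();

    // tanh(+Inf + I*y) = 1 + I*0*sin(2*y) for finite y
    const std::complex<double> a = std::tanh(std::complex<double>(inf, 0.5));
    assert(a.real() == 1.0);

    // tanh(NaN + I*0) = NaN + I*0
    const std::complex<double> b = std::tanh(std::complex<double>(nan, 0.0));
    assert(std::isnan(b.real()) && b.imag() == 0.0);

    return 0;
}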
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::tanh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct TanhFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + /* + * tanh(NaN + i 0) = NaN + i 0 + * + * tanh(NaN + i y) = NaN + i NaN for y != 0 + * + * The imaginary part has the sign of x*sin(2*y), but there's no + * special effort to get this right. + * + * tanh(+-Inf +- i Inf) = +-1 +- 0 + * + * tanh(+-Inf + i y) = +-1 + 0 sin(2y) for y finite + * + * The imaginary part of the sign is unspecified. This special + * case is only needed to avoid a spurious invalid exception when + * y is infinite. + */ + if (!std::isfinite(x)) { + if (std::isnan(x)) { + return resT{q_nan, (y == realT(0) ? y : q_nan)}; + } + const realT res_re = sycl::copysign(realT(1), x); + const realT res_im = sycl::copysign( + realT(0), std::isinf(y) ? 
y : sycl::sin(y) * sycl::cos(y)); + return resT{res_re, res_im}; + } + /* + * tanh(x + i NAN) = NaN + i NaN for non-zero x + * tanh(x +- i Inf) = NaN + i NaN for non-zero x + * tanh(0 + i NAN) = 0 + i NaN + * tanh(0 +- i Inf) = 0 + i NaN + */ + if (!std::isfinite(y)) { + if (x == realT(0)) { + return resT{x, q_nan}; + } + return resT{q_nan, q_nan}; + } + /* ordinary cases */ + return exprm_ns::tanh(exprm_ns::complex(in)); // tanh(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::tanh(in); + } + } +}; + +template +using TanhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TanhStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct TanhOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct TanhContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class tanh_contig_kernel; + +template +sycl::event tanh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using TanhHS = hyperparam_detail::TanhContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = TanhHS::vec_sz; + static constexpr std::uint8_t n_vecs = TanhHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct TanhContigFactory +{ + fnT get() + { + if constexpr (!TanhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = tanh_contig_impl; + return fn; + } + } +}; + +template +struct TanhTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::tanh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename TanhOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class tanh_strided_kernel; + +template +sycl::event + tanh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, TanhOutputType, TanhStridedFunctor, tanh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TanhStridedFactory +{ + fnT get() + { + if constexpr (!TanhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = tanh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::tanh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp new file mode 100644 index 000000000000..caa1cd2029c4 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -0,0 +1,662 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of TRUE_DIVIDE(x1, x2) +/// function. 
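+///
+/// The TrueDivideOutputType table below instantiates kernels only for
+/// floating-point and complex operand pairs; a hedged sketch of what the
+/// entries imply (the td_ns entry list below is authoritative):
+///
+/// \code{.cpp}
+/// using r_t =
+///     TrueDivideOutputType<float, std::complex<float>>::value_type;
+/// static_assert(std::is_same_v<r_t, std::complex<float>>);
+/// // integer pairs have no entry: the factories below then return a
+/// // nullptr function pointer and callers must promote beforehand
+/// static_assert(
+///     !TrueDivideOutputType<std::int32_t, std::int32_t>::is_defined);
+/// \endcode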
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::true_divide +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct TrueDivideFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + using realT2 = typename argT2::value_type; + + return exprm_ns::complex(in1) / + exprm_ns::complex(in2); + } + else if constexpr (tu_ns::is_complex::value && + !tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + + return exprm_ns::complex(in1) / in2; + } + else if constexpr (!tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT2 = typename argT2::value_type; + + return in1 / exprm_ns::complex(in2); + } + else { + return in1 / in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = in1 / in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TrueDivideStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + TrueDivideFunctor>; + +template +struct TrueDivideOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + float, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + double, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct TrueDivideContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class true_divide_contig_kernel; + +template +sycl::event + true_divide_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends 
= {}) +{ + using DivHS = + hyperparam_detail::TrueDivideContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = DivHS::vec_sz; + static constexpr std::uint8_t n_vecs = DivHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor, + true_divide_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct TrueDivideContigFactory +{ + fnT get() + { + if constexpr (!TrueDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_contig_impl; + return fn; + } + } +}; + +template +struct TrueDivideTypeMapFactory +{ + /*! @brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename TrueDivideOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class true_divide_strided_kernel; + +template +sycl::event + true_divide_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, TrueDivideOutputType, TrueDivideStridedFunctor, + true_divide_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct TrueDivideStridedFactory +{ + fnT get() + { + if constexpr (!TrueDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_strided_impl; + return fn; + } + } +}; + +template +using TrueDivideContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor>; + +template +using TrueDivideContigRowContigMatrixBroadcastingFunctor = + elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor>; + +template +class true_divide_matrix_row_broadcast_sg_krn; + +template +class true_divide_row_matrix_broadcast_sg_krn; + +template +sycl::event true_divide_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] / vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, TrueDivideContigMatrixContigRowBroadcastingFunctor, + true_divide_matrix_row_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +} + +template +struct TrueDivideContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (!TrueDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename TrueDivideOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + true_divide_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event true_divide_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] + vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< + argT1, argT2, resT, TrueDivideContigRowContigMatrixBroadcastingFunctor, + true_divide_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, res_p, + res_offset, depends); +}; + +template +struct TrueDivideContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!TrueDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename TrueDivideOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + true_divide_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +template +struct TrueDivideInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + if constexpr (tu_ns::is_complex::value) { + if constexpr (tu_ns::is_complex::value) { + using res_rT = typename resT::value_type; + using arg_rT = typename argT::value_type; + + auto res1 = exprm_ns::complex(res); + res1 /= exprm_ns::complex(in); + res = res1; + } + else { + using res_rT = typename resT::value_type; + + auto res1 = exprm_ns::complex(res); + res1 /= in; + res = res1; + } + } + else { + res /= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res /= in; + } +}; + +/* @brief Types supported by in-place divide */ +template +struct TrueDivideInplaceTypePairSupport +{ + + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + td_ns::TypePairDefinedEntry>, + 
td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TrueDivideInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + if constexpr (TrueDivideInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +using TrueDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + TrueDivideInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TrueDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + TrueDivideInplaceFunctor>; + +template +class true_divide_inplace_contig_kernel; + +template +sycl::event true_divide_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using DivHS = + hyperparam_detail::TrueDivideContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = DivHS::vec_sz; + static constexpr std::uint8_t n_vecs = DivHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, TrueDivideInplaceContigFunctor, + true_divide_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct TrueDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (!TrueDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_strided_kernel; + +template +sycl::event true_divide_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, TrueDivideInplaceStridedFunctor, + true_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TrueDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (!TrueDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_strided_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_row_matrix_broadcast_sg_krn; + +template +using TrueDivideInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + TrueDivideInplaceFunctor>; + +template +sycl::event true_divide_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, TrueDivideInplaceRowMatrixBroadcastingFunctor, + true_divide_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct TrueDivideInplaceRowMatrixBroadcastFactory +{ + fnT
get() + { + if constexpr (!TrueDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace dpctl::tensor::kernels::true_divide diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp new file mode 100644 index 000000000000..6fae9c4f27e5 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -0,0 +1,226 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of TRUNC(x) function. 
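+///
+/// A minimal semantic sketch of the functor defined below (illustrative
+/// values; truncation rounds toward zero and integral inputs are returned
+/// unchanged):
+///
+/// \code{.cpp}
+/// TruncFunctor<float, float> trunc_f{};
+/// assert(trunc_f(2.7f) == 2.0f && trunc_f(-2.7f) == -2.0f);
+/// TruncFunctor<std::int32_t, std::int32_t> trunc_i{};
+/// assert(trunc_i(5) == 5); // integral fast path is the identity
+/// \endcode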
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::trunc +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct TruncFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support subgroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (std::is_integral_v) { + return in; + } + else { + return sycl::trunc(in); + } + } +}; + +template +using TruncContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TruncStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct TruncOutputType +{ + using value_type = + typename std::disjunction, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct TruncContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class trunc_contig_kernel; + +template +sycl::event trunc_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using TruncHS = hyperparam_detail::TruncContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = TruncHS::vec_sz; + static constexpr std::uint8_t n_vecs = TruncHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct TruncContigFactory +{ + fnT get() + { + if constexpr (!TruncOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = trunc_contig_impl; + return fn; + } + } +}; + +template +struct TruncTypeMapFactory +{ + /*!
@brief get typeid for output type of sycl::trunc(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename TruncOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class trunc_strided_kernel; + +template +sycl::event + trunc_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, TruncOutputType, TruncStridedFunctor, trunc_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TruncStridedFactory +{ + fnT get() + { + if constexpr (!TruncOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = trunc_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::trunc diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp new file mode 100644 index 000000000000..bdbc7e50cc86 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp @@ -0,0 +1,70 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +/// +/// \file +/// This file defines utilities for selection of hyperparameters for kernels +/// implementing unary and binary elementwise functions for contiguous inputs +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace dpctl::tensor::kernels::vec_size_utils +{ +template +struct BinaryContigHyperparameterSetEntry + : std::conjunction, std::is_same> +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; + +template +struct UnaryContigHyperparameterSetEntry : std::is_same +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; + +template +struct ContigHyperparameterSetDefault : std::true_type +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; +} // namespace dpctl::tensor::kernels::vec_size_utils diff --git a/dpnp/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpnp/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..f6d2f0175ce8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -0,0 +1,418 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
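+///
+/// Gather sketch for a single indexed axis (illustrative values; WrapIndex
+/// and ClipIndex are the ProjectorT policies from utils/indexing_utils.hpp
+/// that map out-of-bounds indices into the axis extent):
+///
+/// \code{.cpp}
+/// // src = {10, 20, 30, 40, 50}, axis length 5, ind = {0, -1, 3}
+/// // take with WrapIndex: -1 wraps to 4  -> dst = {10, 50, 40}
+/// // take with ClipIndex: -1 clamps to 0 -> dst = {10, 10, 40}
+/// // put scatters in the opposite direction with the same projection
+/// \endcode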
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/indexing_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::indexing +{ + +using dpctl::tensor::ssize_t; + +template +class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t src_offset = orthog_offsets.get_first_offset(); + ssize_t dst_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + src_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + dst_offset += axes_strider(i_along); + + dst[dst_offset] = src[src_offset]; + } +}; + +template +class take_kernel; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + const char *, + char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + ssize_t src_offset, + ssize_t dst_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + take_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + 
cgh.parallel_for( + sycl::range<1>(gws), + TakeFunctor( + src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return take_ev; +} + +template +class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t dst_offset = orthog_offsets.get_first_offset(); + ssize_t val_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + dst_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + val_offset += axes_strider(i_along); + + dst[dst_offset] = val[val_offset]; + } +}; + +template +class put_kernel; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + char *, + const char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + ssize_t dst_offset, + ssize_t val_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + put_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return 
put_ev; +} + +template +struct TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::indexing diff --git a/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp new file mode 100644 index 000000000000..b987ff2988be --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -0,0 +1,1399 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for the vector dot product. 
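+///
+/// Every variant below evaluates a batched inner product; in index form,
+/// with both inputs converted to the output type before multiplication:
+///
+/// \code{.cpp}
+/// // for every batch b and reduction extent n:
+/// //   out[b] = sum_{i = 0}^{n - 1} convert<outT>(lhs[b, i]) *
+/// //                                convert<outT>(rhs[b, i])
+/// \endcode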
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/reductions.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; +namespace su_ns = dpctl::tensor::sycl_utils; + +template +struct SequentialDotProduct +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = nullptr; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialDotProduct(const lhsT *lhs, + const rhsT *rhs, + outT *out, + BatchIndexerT batch_indexer, + RedIndexerT reduced_dims_indexer, + std::size_t reduction_size) + : lhs_(lhs), rhs_(rhs), out_(out), batch_indexer_(batch_indexer), + reduced_dims_indexer_(reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &batch_offsets = batch_indexer_(id[0]); + const ssize_t &lhs_batch_offset = batch_offsets.get_first_offset(); + const ssize_t &rhs_batch_offset = batch_offsets.get_second_offset(); + const ssize_t &out_batch_offset = batch_offsets.get_third_offset(); + + outT red_val(0); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + auto reduction_offsets = reduced_dims_indexer_(m); + auto lhs_reduction_offset = reduction_offsets.get_first_offset(); + auto rhs_reduction_offset = reduction_offsets.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + red_val += convert_impl( + lhs_[lhs_batch_offset + lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + } + + out_[out_batch_offset] = red_val; + } +}; + +template +struct DotProductFunctor +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = nullptr; + ReductionOpT reduction_op_; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; + +public: + DotProductFunctor(const lhsT *lhs, + const rhsT *rhs, + outT *res, + const ReductionOpT &reduction_op, + const BatchIndexerT &batch_indexer, + const RedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), + batch_indexer_(batch_indexer), + reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), batches_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + // for each input + + const auto &batch_offsets_ = batch_indexer_(batch_id); + const auto &lhs_batch_offset = batch_offsets_.get_first_offset(); + const auto &rhs_batch_offset = batch_offsets_.get_second_offset(); + const auto 
&out_batch_offset = batch_offsets_.get_third_offset(); + + outT local_red_val(0); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); + const auto &lhs_reduction_offset = + reduction_offsets_.get_first_offset(); + const auto &rhs_reduction_offset = + reduction_offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl( + lhs_[lhs_batch_offset + lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + + local_red_val += val; + } + + auto work_group = it.get_group(); + outT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, outT(0), reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_batch_offset]); + res_ref += red_val_over_wg; + } + } +}; + +template +struct DotProductCustomFunctor +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = nullptr; + ReductionOpT reduction_op_; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; + +public: + DotProductCustomFunctor(const lhsT *lhs, + const rhsT *rhs, + outT *res, + const ReductionOpT &reduction_op, + const BatchIndexerT &batch_indexer, + const RedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), + batch_indexer_(batch_indexer), + reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + batches_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + // for each input + + const auto &batch_offsets_ = batch_indexer_(batch_id); + const auto &lhs_batch_offset = batch_offsets_.get_first_offset(); + const auto &rhs_batch_offset = batch_offsets_.get_second_offset(); + const auto &out_batch_offset = batch_offsets_.get_third_offset(); + + outT local_red_val(0); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); + const auto &lhs_reduction_offset = + reduction_offsets_.get_first_offset(); + const auto &rhs_reduction_offset = + reduction_offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl( + lhs_[lhs_batch_offset + 
lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + + local_red_val += val; + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_batch_offset]); + res_ref += red_val_over_wg; + } + } +}; + +template class kernel_name_token> +sycl::event sequential_dot_product(sycl::queue &exec_q, + const lhsTy *lhs, + const rhsTy *rhs, + resTy *res, + std::size_t batches, + std::size_t reduction_nelems, + const BatchIndexerT &batch_indexer, + const RedIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for< + kernel_name_token>( + sycl::range<1>(batches), + SequentialDotProduct(lhs, rhs, res, batch_indexer, + reduction_indexer, + reduction_nelems)); + }); + + return dot_ev; +} + +template class kernel_name_token> +sycl::event submit_atomic_dot_product(sycl::queue &exec_q, + const lhsTy *lhs, + const rhsTy *rhs, + resTy *res, + std::size_t wg, + std::size_t batches, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const BatchIndexerT &batch_indexer, + const RedIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{batches * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, DotProductFunctor( + lhs, rhs, res, ReductionOpT(), batch_indexer, + reduction_indexer, reduction_nelems, batches, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + + using KernelName = class custom_reduction_wrapper>; + + cgh.parallel_for( + ndRange, + DotProductCustomFunctor( + lhs, rhs, res, ReductionOpT(), batch_indexer, + reduction_indexer, local_memory, reduction_nelems, batches, + reductions_per_wi)); + } + }); + return dot_ev; +} + +template +class dot_product_seq_krn; + +template +class dot_product_init_krn; + +template +class dot_product_krn; + +typedef sycl::event (*dot_product_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event dot_product_impl(sycl::queue &exec_q, + std::size_t batches, + std::size_t reduction_nelems, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + int batch_nd, + const ssize_t *batch_shape_and_strides, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, + const std::vector &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputBatchIndexerT = + 
dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const InputOutputBatchIndexerT inp_out_batch_indexer{ + batch_nd, batch_lhs_offset, batch_rhs_offset, batch_res_offset, + batch_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset, + reduction_rhs_offset, + reduction_shape_stride}; + + sycl::event dot_ev = + sequential_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems, + inp_out_batch_indexer, reduction_indexer, depends); + + return dot_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = batch_shape_and_strides; + const ssize_t *const &res_strides = + batch_shape_and_strides + 3 * batch_nd; + const IndexerT res_indexer(batch_nd, batch_res_offset, res_shape, + res_strides); + using InitKernelName = + class dot_product_init_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(batches), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = 0; + }); + }); + + using ReductionOpT = sycl::plus; + + using BatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const BatchIndexerT batch_indexer{batch_nd, batch_lhs_offset, + batch_rhs_offset, batch_res_offset, + batch_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset, + reduction_rhs_offset, + reduction_shape_stride}; + + static constexpr std::size_t preferred_reductions_per_wi = + 4; // determined experimentally + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event dot_ev = + submit_atomic_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems, + reductions_per_wi, reduction_groups, batch_indexer, + reduction_indexer, {res_init_ev}); + + return dot_ev; + } +} + +typedef sycl::event (*dot_product_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event + dot_product_contig_impl(sycl::queue &exec_q, + std::size_t batches, + std::size_t reduction_nelems, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, + const std::vector &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp) + + batch_lhs_offset + reduction_lhs_offset; + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp) + + batch_rhs_offset + reduction_rhs_offset; + resTy *res_tp = reinterpret_cast(res_cp) + batch_res_offset; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputBatchIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputBatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer< + InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; + const InputOutputBatchIndexerT inp_out_batch_indexer{ + inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + + sycl::event dot_ev = + sequential_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems, + inp_out_batch_indexer, reduction_indexer, depends); + + return dot_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.fill(res_tp, resTy(0), batches); + }); + + using ReductionOpT = sycl::plus; + + using InputBatchIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputBatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer< + InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; + const InputOutputBatchIndexerT inp_out_batch_indexer{ + inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + + static constexpr std::size_t preferred_reductions_per_wi = + 4; // determined experimentally + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event dot_ev = + submit_atomic_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems, + reductions_per_wi, reduction_groups, inp_out_batch_indexer, + reduction_indexer, {res_init_ev}); + + return dot_ev; + } +} + +template +struct DotProductNoAtomicFunctor +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = nullptr; + ReductionOpT reduction_op_; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; + +public: + DotProductNoAtomicFunctor(const lhsT *lhs, + const rhsT *rhs, + outT *res, + const ReductionOpT &reduction_op, + const BatchIndexerT &batch_indexer, + const RedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), + batch_indexer_(batch_indexer), + reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), batches_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + const std::size_t n_reduction_groups = it.get_group_range(0) / batches_; + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + // for each input + + const auto &batch_offsets_ = batch_indexer_(batch_id); + const auto &lhs_batch_offset = batch_offsets_.get_first_offset(); + const auto &rhs_batch_offset = batch_offsets_.get_second_offset(); + const auto &out_batch_offset = batch_offsets_.get_third_offset(); + + outT local_red_val(0); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); + const auto &lhs_reduction_offset = + reduction_offsets_.get_first_offset(); + const auto &rhs_reduction_offset = + reduction_offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl( + lhs_[lhs_batch_offset + lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + + local_red_val += val; + } + + auto work_group = it.get_group(); + + using RedOpT = typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + outT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, outT(0), RedOpT()); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_batch_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template +struct DotProductNoAtomicCustomFunctor +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = 
nullptr; + ReductionOpT reduction_op_; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; + +public: + DotProductNoAtomicCustomFunctor(const lhsT *lhs, + const rhsT *rhs, + outT *res, + const ReductionOpT &reduction_op, + const BatchIndexerT &batch_indexer, + const RedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), + batch_indexer_(batch_indexer), + reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + batches_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + const std::size_t n_reduction_groups = it.get_group_range(0) / batches_; + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + // for each input + + const auto &batch_offsets_ = batch_indexer_(batch_id); + const auto &lhs_batch_offset = batch_offsets_.get_first_offset(); + const auto &rhs_batch_offset = batch_offsets_.get_second_offset(); + const auto &out_batch_offset = batch_offsets_.get_third_offset(); + + outT local_red_val(0); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); + const auto &lhs_reduction_offset = + reduction_offsets_.get_first_offset(); + const auto &rhs_reduction_offset = + reduction_offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl( + lhs_[lhs_batch_offset + lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + + local_red_val += val; + } + + auto work_group = it.get_group(); + + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_batch_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template class kernel_name_token> +sycl::event + submit_no_atomic_dot_product(sycl::queue &exec_q, + const lhsTy *lhs, + const rhsTy *rhs, + resTy *res, + std::size_t wg, + std::size_t batches, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const BatchIndexerT &batch_indexer, + const RedIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{batches * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr 
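+        // can_use_reduce_over_group gates this branch on result types for
+        // which sycl::reduce_over_group and its known identities are
+        // defined; otherwise the custom functor reduces through a
+        // sycl::local_accessor scratchpad instead. A minimal sketch of the
+        // built-in path (names illustrative):
+        //
+        //   outT wg_sum = sycl::reduce_over_group(
+        //       it.get_group(), local_red_val, outT(0), sycl::plus<outT>());
+        //   if (it.get_group().leader())
+        //       out[group_slot] = wg_sum; // one slot per work-group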
(can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + DotProductNoAtomicFunctor( + lhs, rhs, res, ReductionOpT(), batch_indexer, + reduction_indexer, reduction_nelems, batches, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + + using KernelName = class custom_reduction_wrapper>; + + cgh.parallel_for( + ndRange, + DotProductNoAtomicCustomFunctor( + lhs, rhs, res, ReductionOpT(), batch_indexer, + reduction_indexer, local_memory, reduction_nelems, batches, + reductions_per_wi)); + } + }); + return dot_ev; +} + +template +class dot_product_tree_krn; + +template +class dot_product_tree_reduction_krn; + +template +sycl::event dot_product_tree_impl(sycl::queue &exec_q, + std::size_t batches, + std::size_t reduction_nelems, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + int batch_nd, + const ssize_t *batch_shape_and_strides, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, + const std::vector &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputBatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const InputOutputBatchIndexerT inp_out_batch_indexer{ + batch_nd, batch_lhs_offset, batch_rhs_offset, batch_res_offset, + batch_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset, + reduction_rhs_offset, + reduction_shape_stride}; + + sycl::event dot_ev = + sequential_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems, + inp_out_batch_indexer, reduction_indexer, depends); + + return dot_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + using ReductionOpT = typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + using BatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const BatchIndexerT batch_indexer{batch_nd, batch_lhs_offset, + batch_rhs_offset, batch_res_offset, + batch_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset, + reduction_rhs_offset, + reduction_shape_stride}; + + if (batches == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event dot_ev = + submit_no_atomic_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems, + reductions_per_wi, reduction_groups, batch_indexer, + reduction_indexer, depends); + + return dot_ev; + } 
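+    // Worked example for the single-pass branch above: with
+    // reduction_nelems = 10000 and wg = max_wg = 1024 (illustrative values),
+    //   reductions_per_wi = ceil(10000 / 1024) = 10,
+    //   reduction_groups = ceil(10000 / (10 * 1024)) = 1,
+    // so a single work-group per batch writes res directly and no temporary
+    // or second reduction pass is needed.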
+    else {
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        // more than one work-group is needed, which requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        // returns a unique_ptr owning the temporary USM allocation
+        auto partially_reduced_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                batches * (reduction_groups + second_iter_reduction_groups_),
+                exec_q);
+
+        resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * batches;
+
+        sycl::event first_reduction_ev;
+        {
+            using LhsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+            using RhsIndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputBatchIndexerT =
+                dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                    LhsIndexerT, RhsIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+            const LhsIndexerT lhs_indexer(batch_nd, batch_lhs_offset,
+                                          batch_shape_and_strides);
+            const RhsIndexerT rhs_indexer(
+                batch_nd, batch_rhs_offset, batch_shape_and_strides,
+                batch_shape_and_strides + 2 * batch_nd);
+            static constexpr ResIndexerT noop_tmp_indexer{};
+
+            const InputOutputBatchIndexerT in_out_iter_indexer{
+                lhs_indexer, rhs_indexer, noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                red_nd, reduction_lhs_offset, reduction_rhs_offset,
+                reduction_shape_stride};
+
+            first_reduction_ev = submit_no_atomic_dot_product<
+                lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+                ReductionIndexerT, dot_product_tree_krn>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, wg, batches,
+                reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems +
+                 preferred_reductions_per_wi * wg - 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            using InputIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ batches,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{
+                inp_indexer, res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev =
+                dpctl::tensor::kernels::submit_no_atomic_reduction<
+                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, dot_product_tree_reduction_krn>(
+                    exec_q, temp_arg, temp2_arg, identity_val, wg, batches,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_,
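+                    // Sizing of the temporary above, worked through with
+                    // illustrative numbers: reduction_nelems = 1'000'000,
+                    // wg = max_wg = 256, preferred_reductions_per_wi = 8:
+                    //   reduction_groups = ceil(1e6 / 2048) = 489,
+                    //   second_iter_reduction_groups_ = ceil(489 / 2048) = 1,
+                    // so batches * (489 + 1) elements are allocated: the
+                    // first pass fills 489 per batch and later passes
+                    // ping-pong into the remaining slots.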
in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + batch_nd, batch_res_offset, + /* shape */ batch_shape_and_strides, + /* strides */ batch_shape_and_strides + 2 * batch_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, dot_product_tree_reduction_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, batches, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + // transfer ownership of USM allocation to host_task + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, partially_reduced_tmp_owner); + + return cleanup_host_task_event; + } +} + +template +sycl::event + dot_product_contig_tree_impl(sycl::queue &exec_q, + std::size_t batches, + std::size_t reduction_nelems, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, + const std::vector &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp) + + batch_lhs_offset + reduction_lhs_offset; + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp) + + batch_rhs_offset + reduction_rhs_offset; + resTy *res_tp = reinterpret_cast(res_cp) + batch_res_offset; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputBatchIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputBatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer< + InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; + const InputOutputBatchIndexerT inp_out_batch_indexer{ + inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + + sycl::event dot_ev = + sequential_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems, + inp_out_batch_indexer, reduction_indexer, 
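+        // In the contiguous case the batch indexer is pure arithmetic: with
+        // batches = 3 and reduction_nelems = 5 (illustrative), batch b reads
+        // lhs and rhs starting at offset b * 5 (i.e. 0, 5, 10) and writes a
+        // single res element at offset b, which is exactly what the
+        // Strided1DIndexer/NoOpIndexer combination above encodes.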
depends);
+
+        return dot_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    using ReductionOpT =
+        typename std::conditional<std::is_same_v<resTy, bool>,
+                                  sycl::logical_or<resTy>,
+                                  sycl::plus<resTy>>::type;
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        using InputBatchIndexerT =
+            dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputBatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+
+        const InputBatchIndexerT inp_batch_indexer{
+            /* size */ batches, /* step */ reduction_nelems};
+        const InputOutputBatchIndexerT inp_out_batch_indexer{
+            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
+                                                             NoOpIndexerT{}};
+
+        if (batches == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event dot_ev = submit_no_atomic_dot_product<
+            lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+            ReductionIndexerT, dot_product_tree_krn>(
+            exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
+            reductions_per_wi, reduction_groups, inp_out_batch_indexer,
+            reduction_indexer, depends);
+
+        return dot_ev;
+    }
+    else {
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        // more than one work-group is needed, which requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        // unique_ptr that owns temporary allocation for partial reductions
+        auto partially_reduced_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                batches * (reduction_groups + second_iter_reduction_groups_),
+                exec_q);
+        // get raw pointers
+        resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * batches;
+
+        sycl::event first_reduction_ev;
+        {
+            using InputBatchIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputBatchIndexerT =
+                dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                    InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    NoOpIndexerT, NoOpIndexerT>;
+
+            const InputBatchIndexerT inp_batch_indexer{
+                /* size */ batches,
+                /* step */ reduction_nelems};
+            const InputOutputBatchIndexerT inp_out_batch_indexer{
+                inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+            static constexpr ReductionIndexerT reduction_indexer{
+                NoOpIndexerT{}, NoOpIndexerT{}};
+
+            first_reduction_ev = submit_no_atomic_dot_product<
+                lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+                ReductionIndexerT,
dot_product_tree_krn>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, wg, batches, + reduction_nelems, preferred_reductions_per_wi, reduction_groups, + inp_out_batch_indexer, reduction_indexer, depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, dot_product_tree_reduction_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, batches, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, dot_product_tree_reduction_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, batches, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, partially_reduced_tmp_owner); + + return cleanup_host_task_event; + } +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp new file mode 100644 index 000000000000..5644ea172a1d --- 
/dev/null +++ b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -0,0 +1,4233 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for general matrix multiplication (GEMM). 
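+///
+/// For one batch entry with row-major, C-contiguous operands the kernels
+/// compute
+///   res[i * m + j] = sum over s in [0, k) of lhs[i * k + s] * rhs[s * m + j]
+/// for 0 <= i < n and 0 <= j < m; strided layouts are handled through the
+/// indexer objects passed to each kernel.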
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/reductions.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +namespace gemm_detail +{ + +template +void scale_gemm_k_parameters(const std::size_t &local_mem_size, + const std::size_t &reserved_slm_size, + const std::size_t delta_k, + std::size_t &n_wi, + std::size_t &delta_n) +{ + static constexpr std::size_t slm_elem_size = sizeof(T) * m_groups; + + while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >= + local_mem_size) { + n_wi = n_wi / 2; + delta_n = delta_n / 2; + if (delta_n == 0) + throw std::runtime_error("Insufficient resources"); + } +} + +template +void scale_gemm_nm_parameters(const std::size_t &local_mem_size, + const std::size_t &reserved_slm_size, + const std::size_t &wi_delta_n, + std::size_t &wi_delta_k, + std::size_t &wg_delta_n, + std::size_t &wg_delta_m) +{ + static constexpr std::size_t slm_A_elem_size = sizeof(T); + static constexpr std::size_t slm_B_elem_size = sizeof(T) * wi_delta_m; + + while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) + + (wi_delta_k * wg_delta_m * slm_B_elem_size) + + reserved_slm_size >= + local_mem_size) { + wg_delta_n /= 2; + wg_delta_m /= 2; + wi_delta_k /= 2; + if (wg_delta_n == 0) + throw std::runtime_error("Insufficient resources"); + } +} +} // namespace gemm_detail + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +class gemm_seq_reduction_krn; + +template +class gemm_tree_reduction_krn; + +template +sycl::event single_reduction_for_gemm(sycl::queue &exec_q, + T *tmp_tp, + T *res_tp, + T identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, + int res_nd, + ssize_t res_offset, + const ssize_t *res_shapes_strides, + const std::vector &depends) +{ + sycl::event red_ev; + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides}; + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + res_iter_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for>( + iter_range, + SequentialReduction( + tmp_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + } + else { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, ResIndexerT>; + using ReductionIndexerT = 
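+        // As in the branch above, the temporary is a C-contiguous
+        // (reduction_nelems, iter_nelems) matrix reduced along axis 0:
+        // partial sum r of output element p lives at
+        // tmp[r * iter_nelems + p], so Strided1DIndexer{reduction_nelems,
+        // iter_nelems} yields offsets 0, iter_nelems, 2 * iter_nelems, ...
+        // for a fixed p.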
dpctl::tensor::offset_utils::Strided1DIndexer; + + const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides}; + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + res_iter_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + red_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, tmp_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + } + return red_ev; +} + +template +sycl::event + single_reduction_for_gemm_contig(sycl::queue &exec_q, + T *tmp_tp, + T *res_tp, + T identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, + const std::vector &depends) +{ + sycl::event red_ev; + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; + // tmp allocation is a C-contiguous matrix (reduction_nelems, + // iter_nelems) and we are reducing by axis 0 + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for>( + iter_range, + SequentialReduction( + tmp_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + } + else { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; + // tmp allocation is a C-contiguous matrix + // (reduction_nelems, iter_nelems). 
Reducing along axis 0 + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + red_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, tmp_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + } + return red_ev; +} + +template +sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, + T *partially_reduced_tmp, + T *partially_reduced_tmp2, + T *res_tp, + T identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, + int res_nd, + ssize_t res_offset, + const ssize_t *res_shape_strides, + const std::vector &depends) +{ + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; + // partially_reduced_tmp is C-contig matrix with shape + // (reduction_nelems, iter_nelems). Reducing along axis 0. + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + first_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, identity_val, + wg, iter_nelems, reduction_nelems, reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + T *temp_arg = partially_reduced_tmp2; + T *temp2_arg = partially_reduced_tmp; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, temp_arg, temp2_arg, identity_val, 
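+                // Shape of this tree phase, as a simplified sketch
+                // (ceil_div and the launch helpers are hypothetical):
+                //
+                //   T *src = partially_reduced_tmp2, *dst = partially_reduced_tmp;
+                //   while (nelems > preferred_reductions_per_wi * max_wg) {
+                //       std::size_t groups =
+                //           ceil_div(nelems, preferred_reductions_per_wi * wg);
+                //       launch_partial_reduction(src, dst, nelems, groups);
+                //       nelems = groups;
+                //       std::swap(src, dst);
+                //   }
+                //   // one final kernel folds the surviving nelems into res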
wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + /* ndim */ res_nd, + /* offset */ static_cast(res_offset), + /* packed shape_strides*/ res_shape_strides}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + return final_reduction_ev; +} + +template +class gemm_reduction_over_group_temps_contig_krn; + +template +sycl::event + tree_reduction_for_gemm_contig(sycl::queue &exec_q, + T *partially_reduced_tmp, + T *partially_reduced_tmp2, + T *res_tp, + T identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, + const std::vector &depends) +{ + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + const sycl::event &first_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_reduction_over_group_temps_contig_krn>( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, identity_val, + wg, iter_nelems, reduction_nelems, reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, depends); + + std::size_t remaining_reduction_nelems = reduction_groups; + + T *temp_arg = partially_reduced_tmp2; + T *temp2_arg = partially_reduced_tmp; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = 
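+            // After the first pass the layout flips: each pass stores the
+            // partial sums of output element p contiguously at
+            // dst[p * reduction_groups_ + r], so later passes reduce over r
+            // with a NoOpIndexer and step through p with a Strided1DIndexer,
+            // as set up below.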
dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + // n * m = iter_nelems because essentially, this process + // creates a stack of reduction_nelems 2D matrices and we reduce + // along the stack axis + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_reduction_over_group_temps_contig_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + { + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{ + /* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_reduction_over_group_temps_contig_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + return final_reduction_ev; + } +} + +template +class GemmBatchFunctorThreadK +{ +private: + const lhsT *lhs = nullptr; + const rhsT *rhs = nullptr; + resT *res = nullptr; + LocAccT workspace; + LocAccT local_B_block; + std::size_t n = 0; + std::size_t n_blocks = 0; + std::size_t delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t delta_k = 0; + std::size_t n_wi = 0; + std::size_t m = 0; + std::size_t batch_nelems = 0; + BatchDimsIndexerT batch_indexer; + OuterInnerDimsIndexerT lhs_indexer; + OuterInnerDimsIndexerT rhs_indexer; + OuterInnerDimsIndexerT res_indexer; + +public: + GemmBatchFunctorThreadK(const lhsT *lhs_, + const rhsT *rhs_, + resT *res_, + LocAccT workspace_, + LocAccT local_B_block_, + std::size_t n_, + std::size_t n_blocks_, + std::size_t delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t delta_k_, + std::size_t n_wi_, + std::size_t m_, + std::size_t batch_nelems_, + const BatchDimsIndexerT &batch_indexer_, + const OuterInnerDimsIndexerT &lhs_indexer_, 
+ const OuterInnerDimsIndexerT &rhs_indexer_, + const OuterInnerDimsIndexerT &res_indexer_) + : lhs(lhs_), rhs(rhs_), res(res_), workspace(workspace_), + local_B_block(local_B_block_), n(n_), n_blocks(n_blocks_), + delta_n(delta_n_), k(k_), k_blocks(k_blocks_), delta_k(delta_k_), + n_wi(n_wi_), m(m_), batch_nelems(batch_nelems_), + batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_), + rhs_indexer(rhs_indexer_), res_indexer(res_indexer_) + { + } + + void operator()(sycl::nd_item<1> it) const + { + // for batching: + // (current matrix in batch) m_id = global_id / (global_range / + // batch_nelems) for lhs, offset = m_id * (n * k) for rhs, offset = + // m_id + // * (k * m) for res, offset = m_id * (n * m) + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = + it.get_group_linear_id() - m_id * n_groups_per_batch; + const std::size_t lid = it.get_local_linear_id(); + + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); + + const auto &lhs_offset = three_offsets_.get_first_offset(); + const auto &rhs_offset = three_offsets_.get_second_offset(); + const auto &res_offset = three_offsets_.get_third_offset(); + + // lift gr_id -> (block_i, block_j, block_s) + // block_i moves fastest, then block_s, then block_j + + const std::size_t r_size = (n_blocks * k_blocks); + // 0 <= block_j < m_blocks, + const std::size_t block_j = gr_id / r_size; + // 0 <= block_r < n_blocks * k_blocks + const std::size_t block_r = gr_id - block_j * r_size; + // 0 <= block_s < k_blocks + const std::size_t block_s = block_r / n_blocks; + // 0 <= block_i < n_blocks + const std::size_t block_i = block_r - block_s * n_blocks; + + // 0 <= local_i < delta_n + const std::size_t local_i = lid / (delta_k); + // 0 <= local_s < delta_k + const std::size_t local_s = lid - local_i * (delta_k); + + std::size_t i = block_i * delta_n + local_i; + std::size_t j = m_groups * block_j; + std::size_t s = block_s * delta_k * n_wi + local_s; + + using accV_t = typename LocAccT::value_type; + + static constexpr resT identity_ = resT(0); + if (local_i == 0) { + for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { + const std::size_t sq = s + q; + const std::size_t sqmj = sq * m + j; + + if constexpr (m_groups == 1 && std::is_same_v) { + local_B_block[local_s + q] = + (sq < k && j < m) + ? static_cast( + rhs[rhs_offset + rhs_indexer(sqmj)]) + : identity_; + } + else { + accV_t local_B_vec; +#pragma unroll + for (std::size_t vec_idx = 0; vec_idx < m_groups; + ++vec_idx) { + local_B_vec[vec_idx] = + (sq < k && j + vec_idx < m) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(sqmj + vec_idx)]) + : identity_; + } + local_B_block[local_s + q] = local_B_vec; + } + } + } + + it.barrier(sycl::access::fence_space::local_space); + + std::size_t t_shift = block_s * delta_k * n_wi; + std::size_t global_s_offset = i * k + t_shift; + + accV_t private_sum(identity_); + static constexpr accV_t vec_identity_(identity_); + for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) { + private_sum += + ((i < n) && (t + t_shift < k)) + ? 
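+              // Worked example of the gr_id lifting above with n_blocks = 4
+              // and k_blocks = 3 (illustrative), so r_size = 12: gr_id = 30
+              // gives block_j = 30 / 12 = 2, block_r = 30 - 24 = 6, then
+              // block_s = 6 / 4 = 1 and block_i = 6 - 4 * 1 = 2.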
(static_cast( + lhs[lhs_offset + lhs_indexer(global_s_offset + t)]) * + local_B_block[t]) + : vec_identity_; + } + + std::size_t workspace_i_shift = local_i * delta_k; + workspace[workspace_i_shift + local_s] = private_sum; + + it.barrier(sycl::access::fence_space::local_space); + + if (local_s == 0 && i < n) { + accV_t local_sum(workspace[workspace_i_shift]); + for (std::size_t t = 1; t < delta_k; ++t) { + local_sum += workspace[workspace_i_shift + t]; + } + + sycl::atomic_ref + aout0(res[res_offset + res_indexer(i * m + j)]); + + if constexpr (m_groups == 1 && std::is_same_v) { + aout0 += local_sum; + } + else { + aout0 += local_sum[0]; + +#pragma unroll + for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) { + if (j + vec_id < m) { + sycl::atomic_ref< + resT, sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::global_space> + aout1(res[res_offset + + res_indexer(i * m + j + vec_id)]); + + aout1 += local_sum[vec_id]; + } + } + } + } + } +}; + +template +class gemm_init_krn; + +template +class gemm_k_krn; + +template +class gemm_nm_krn; + +template +class gemm_batch_k_krn; + +template +class gemm_batch_nm_krn; + +namespace gemm_detail +{ + +template +sycl::event _gemm_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + const std::vector &depends) +{ + static constexpr std::size_t m_groups = 4; + const std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + std::size_t n_blocks = (n + delta_n - 1) / delta_n; + std::size_t m_blocks = (m + m_groups - 1) / m_groups; + std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); + + std::size_t lws = delta_n * delta_k; + + auto gRange = + sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws); + auto lRange = sycl::range<1>(lws); + + auto ndRange = sycl::nd_range<1>(gRange, lRange); + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using LocAccT = sycl::local_accessor, 1>; + LocAccT local_B_block(n_wi * delta_k, cgh); + LocAccT workspace(delta_n * delta_k, cgh); + + using KernelName = + class gemm_batch_k_krn; + cgh.parallel_for( + ndRange, + GemmBatchFunctorThreadK( + lhs_tp, rhs_tp, res_tp, std::move(workspace), + std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks, + delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer)); + }); + return gemm_ev; +} + +template +sycl::event _gemm_small_m_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + const std::vector &depends) +{ + static constexpr std::size_t m_groups = 1; + const std::size_t 
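+    // Launch geometry of the thread-K kernels, worked through with the
+    // defaults above (delta_n = 32, delta_k = 4, n_wi = 64, so lws = 128)
+    // and illustrative sizes n = 1000, k = 5000, m = 10, m_groups = 4:
+    //   n_blocks = ceil(1000 / 32) = 32, m_blocks = ceil(10 / 4) = 3,
+    //   k_blocks = ceil(5000 / 256) = 20,
+    // giving batch_nelems * 32 * 3 * 20 work-groups; the k_blocks partial
+    // products are combined through the atomic_ref accumulation into res.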
delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + std::size_t n_blocks = (n + delta_n - 1) / delta_n; + std::size_t m_blocks = (m + m_groups - 1) / m_groups; + std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); + + std::size_t lws = delta_n * delta_k; + + auto gRange = + sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws); + auto lRange = sycl::range<1>(lws); + + auto ndRange = sycl::nd_range<1>(gRange, lRange); + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using LocAccT = sycl::local_accessor; + LocAccT local_B_block(n_wi * delta_k, cgh); + LocAccT workspace(delta_n * delta_k, cgh); + + using KernelName = + class gemm_batch_k_krn; + cgh.parallel_for( + ndRange, + GemmBatchFunctorThreadK( + lhs_tp, rhs_tp, res_tp, std::move(workspace), + std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks, + delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer)); + }); + + return gemm_ev; +} + +} // end of namespace gemm_detail + +template +class GemmBatchFunctorThreadNM_vecm +{ +private: + const lhsT *lhs = nullptr; + const rhsT *rhs = nullptr; + resT *res = nullptr; + LocAccT1 local_lhs_block; + LocAccT2 local_rhs_block; + std::size_t batch_nelems; + std::size_t n = 0; + std::size_t k = 0; + std::size_t m = 0; + std::size_t n_groups = 0; + std::uint32_t wg_delta_n = 0; + std::uint32_t wg_delta_m = 0; + std::uint32_t wi_delta_k = 0; + BatchDimsIndexerT batch_indexer; + LhsIndexerT lhs_indexer; + RhsIndexerT rhs_indexer; + ResIndexerT res_indexer; + +public: + /*! 
@brief */ + GemmBatchFunctorThreadNM_vecm(const lhsT *lhs_, + const rhsT *rhs_, + resT *res_, + LocAccT1 local_lhs_block_, + LocAccT2 local_rhs_block_, + std::size_t batch_nelems_, + std::size_t n_, + std::size_t k_, + std::size_t m_, + std::size_t n_groups_, + std::size_t wg_delta_n_, + std::size_t wg_delta_m_, + std::size_t wi_delta_k_, + const BatchDimsIndexerT &batch_indexer_, + const LhsIndexerT &lhs_indexer_, + const RhsIndexerT &rhs_indexer_, + const ResIndexerT &res_indexer_) + : lhs(lhs_), rhs(rhs_), res(res_), local_lhs_block(local_lhs_block_), + local_rhs_block(local_rhs_block_), batch_nelems(batch_nelems_), n(n_), + k(k_), m(m_), n_groups(n_groups_), wg_delta_n(wg_delta_n_), + wg_delta_m(wg_delta_m_), wi_delta_k(wi_delta_k_), + batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_), + rhs_indexer(rhs_indexer_), res_indexer(res_indexer_) + { + } + + void operator()(sycl::nd_item<1> it) const + { + static constexpr resT zero_(0); + static constexpr std::uint32_t wi_total_delta_m = + wi_delta_m_vecs * m_vec_size; + + const std::size_t gws_per_batch = it.get_group_range(0) / batch_nelems; + const std::size_t batch_id = it.get_group_linear_id() / gws_per_batch; + const std::size_t gr_id = + it.get_group_linear_id() - batch_id * gws_per_batch; + + const auto &three_offsets_ = + batch_indexer(static_cast(batch_id)); + + const auto &lhs_offset = three_offsets_.get_first_offset(); + const auto &rhs_offset = three_offsets_.get_second_offset(); + const auto &res_offset = three_offsets_.get_third_offset(); + + // 0 <= block_j < m_groups + const std::size_t block_j = gr_id / n_groups; + // 0 <= block_i < n_groups + const std::size_t block_i = gr_id - block_j * n_groups; + + // Assumption: lws == wg_delta_n * wg_delta_m + const std::uint32_t lid = it.get_local_linear_id(); + // 0 <= local_j < (lws / wg_delta_n == wg_delta_m) + const std::uint32_t local_j = lid / wg_delta_n; + // sub-group lanes map to adjacent local_i + const std::uint32_t local_i = lid - local_j * wg_delta_n; + + // Coordinates of the block of C the work-group works on + std::size_t i = block_i * wg_delta_n * wi_delta_n; + std::size_t j = block_j * wg_delta_m * wi_total_delta_m; + + using slmA_t = typename LocAccT1::value_type; + using slmB_t = typename LocAccT2::value_type; + + const std::size_t a_st0 = k; + const std::size_t a_st1 = 1; + + const std::size_t b_st0 = m; + const std::size_t b_st1 = 1; + + const std::size_t c_st0 = m; + const std::size_t c_st1 = 1; + + // allocate/initialize private matrix C + // size ( wi_total_delta_n, wi_total_delta_m ) + static constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs; + std::array private_C{slmB_t{zero_}}; + + for (std::size_t s = 0; s < k; s += wi_delta_k) { + // populate local_lhs_block ( wg_delta_n * wi_delta_n, + // wi_delta_k) + for (std::uint32_t vid = lid; vid < local_lhs_block.size(); + vid += it.get_local_range()[0]) { + // 0 <= v_i < wg_delta_n * wi_delta_n + const std::uint32_t v_i = vid / wi_delta_k; + // 0 <= v_s < wi_delta_k + const std::uint32_t v_s = vid - v_i * wi_delta_k; + + const std::size_t g_i = i + v_i; + const std::size_t g_s = s + v_s; + + const std::uint32_t mapped_vid = + wg_delta_n * wi_delta_n * v_s + v_i; + local_lhs_block[mapped_vid] = + (g_i < n && g_s < k) + ? 
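+            // mapped_vid above stores the lhs tile k-major: element
+            // (v_i, v_s) lands at v_s * (wg_delta_n * wi_delta_n) + v_i, so
+            // in the multiply-accumulate loop work-items with consecutive
+            // local_i read adjacent SLM entries for each pr_k.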
static_cast( + lhs[lhs_offset + + lhs_indexer(g_i * a_st0 + g_s * a_st1)]) + : zero_; + } + + // populate local_rhs_block> ( wg_delta_m * + // wi_delta_m_vecs, wi_delta_k ) + for (std::uint32_t vid = lid; vid < local_rhs_block.size(); + vid += it.get_local_range()[0]) { + // 0 <= v_j < wg_delta_m * wi_delta_m_vecs + const std::uint32_t v_j = vid / wi_delta_k; + // 0 <= v_s < wi_delta_k + const std::uint32_t v_s = vid - v_j * wi_delta_k; + + const std::size_t g_j = j + v_j * m_vec_size; + const std::size_t g_s = s + v_s; + const std::uint32_t mapped_vid = + wg_delta_m * wi_delta_m_vecs * v_s + v_j; + + if constexpr (m_vec_size == 1) { + local_rhs_block[mapped_vid] = + (g_j < m && g_s < k) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(g_s * b_st0 + g_j * b_st1)]) + : zero_; + } + else { + slmB_t vec{}; +#pragma unroll + for (std::uint32_t lane_id = 0; lane_id < m_vec_size; + ++lane_id) { + const std::size_t g_j1 = g_j + lane_id; + vec[lane_id] = (g_j1 < m && g_s < k) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(g_s * b_st0 + + g_j1 * b_st1)]) + : zero_; + }; + + local_rhs_block[mapped_vid] = vec; + } + } + + it.barrier(sycl::access::fence_space::local_space); + + const std::uint32_t lo_lhs_st_k = (wg_delta_n * wi_delta_n); + const std::uint32_t lo_rhs_rk_k = (wg_delta_m * wi_delta_m_vecs); + for (std::uint32_t pr_k = 0; pr_k < wi_delta_k; ++pr_k) { + std::array pr_lhs{}; +#pragma unroll + for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { + pr_lhs[pr_i] = + local_lhs_block[pr_k * lo_lhs_st_k + + (local_i + pr_i * wg_delta_n)]; + } + + std::array pr_rhs{}; +#pragma unroll + for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j) { + pr_rhs[pr_j] = + local_rhs_block[pr_k * lo_rhs_rk_k + + (local_j + pr_j * wg_delta_m)]; + } + +#pragma unroll + for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { +#pragma unroll + for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; + ++pr_j) { + private_C[pr_i * wi_delta_m_vecs + pr_j] += + pr_lhs[pr_i] * pr_rhs[pr_j]; + } + } + } + + it.barrier(sycl::access::fence_space::local_space); + } + + if constexpr (m_vec_size == 1) { +#pragma unroll + for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { + std::size_t out_i = i + local_i + pr_i * wg_delta_n; + if (out_i < n) { +#pragma unroll + for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; + ++pr_j) { + const std::size_t out_j = + j + (local_j + pr_j * wg_delta_m) * m_vec_size; + const std::size_t out_flat_id = + out_i * c_st0 + out_j * c_st1; + if (out_j < m) { + res[res_offset + res_indexer(out_flat_id)] = + private_C[pr_i * wi_delta_m_vecs + pr_j]; + } + } + } + } + } + else { +#pragma unroll + for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { + std::size_t out_i = i + local_i + pr_i * wg_delta_n; + if (out_i < n) { + // could be unrolled + for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; + ++pr_j) { + std::size_t out_j = + j + (local_j + pr_j * wg_delta_m) * m_vec_size; +#pragma unroll + for (std::uint32_t lane_id = 0; lane_id < m_vec_size; + ++lane_id) { + const std::size_t out_flat_id = + out_i * c_st0 + (out_j + lane_id) * c_st1; + if (out_j + lane_id < m) { + res[res_offset + res_indexer(out_flat_id)] = + private_C[pr_i * wi_delta_m_vecs + pr_j] + [lane_id]; + } + } + } + } + } + } + } +}; + +struct GemmBatchFunctorThreadNM_vecm_HyperParameters +{ +private: + std::uint32_t wi_delta_n = 2; + std::uint32_t wi_delta_m_vecs = 4; + std::uint32_t m_vec_size = 1; + +public: + constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters(); + constexpr 
GemmBatchFunctorThreadNM_vecm_HyperParameters( + std::uint32_t wi_delta_n_, + std::uint32_t wi_delta_m_vecs_, + std::uint32_t m_vec_size_) + : wi_delta_n(wi_delta_n_), wi_delta_m_vecs(wi_delta_m_vecs_), + m_vec_size(m_vec_size_) + { + } + + constexpr std::uint32_t get_wi_delta_n() const { return wi_delta_n; } + constexpr std::uint32_t get_wi_delta_m_vecs() const + { + return wi_delta_m_vecs; + } + constexpr std::uint32_t get_m_vec_size() const { return m_vec_size; } +}; + +template +struct GemmBatchFunctorThreadNM_vecm_HyperParametersSelector +{ + constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector() {} + + constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters get() const + { + if constexpr (sizeof(resT) == 1) { + // 1 * 8 * 2 * 4 == 64 + return GemmBatchFunctorThreadNM_vecm_HyperParameters(8, 2, 4); + } + else if constexpr (sizeof(resT) == 2) { + // 2 * 4 * 2 * 4 == 64 + return GemmBatchFunctorThreadNM_vecm_HyperParameters(4, 2, 4); + } + else if constexpr (sizeof(resT) == 4) { + // 4 * 4 * 1 * 4 == 64 + return GemmBatchFunctorThreadNM_vecm_HyperParameters(4, 1, 4); + } + else if constexpr (sizeof(resT) == 8) { + // 8 * 2 * 1 * 4 == 64 + if constexpr (std::is_same_v>) { + return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 4, 1); + } + else { + return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 1, 4); + } + } + else if constexpr (std::is_same_v>) { + // 16 * 2 * 2 * 1 == 64 + return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 2, 1); + } + else { + return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 2, 1); + } + } +}; + +template +class gemm_batch_nm_vecm_krn; + +namespace gemm_detail +{ + +template +std::tuple + get_wg_delta_m_and_wi_delta_k(const std::size_t slm_byte_size, + const std::uint32_t wg_delta_n, + const std::uint32_t suggested_wg_delta_m) +{ + std::uint32_t wg_delta_m = suggested_wg_delta_m; + + const std::size_t slm_max_rows = + slm_byte_size / + ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T)); + + std::uint32_t wi_delta_k = + (slm_max_rows >= 64) + ? 64 + : 32 * static_cast(slm_max_rows / 32); + + for (std::uint32_t it = 0; !wi_delta_k && (it < 4); ++it) { + wg_delta_m /= 2; + + const std::size_t slm_max_rows = + slm_byte_size / + ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T)); + + wi_delta_k = + (slm_max_rows >= 64) + ? 64 + : ((slm_max_rows >= 32) + ? 32 + : (slm_max_rows >= 16 ? 
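+        // Illustrative budget check: with 64 KiB of SLM, sizeof(T) = 4,
+        // wg_delta_n * wi_delta_n = 128 and wg_delta_m * wi_delta_m = 64,
+        //   slm_max_rows = 65536 / ((128 + 64) * 4) = 85,
+        // so wi_delta_k = 64 on the first try; this retry loop halves
+        // wg_delta_m only while the quantized row count collapses to zero,
+        // and "Insufficient resources" is thrown after four failed attempts.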
16 + : 8 * static_cast( + slm_max_rows / 8))); + } + + if (!wi_delta_k) { + throw std::runtime_error("Insufficient resources"); + } + + return std::make_tuple(wg_delta_m, wi_delta_k); +} + +template +sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + std::vector const &depends) +{ + static constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector< + resTy> + selector{}; + static constexpr auto hyper_params = selector.get(); + + static constexpr std::uint32_t wi_delta_n = hyper_params.get_wi_delta_n(); + static constexpr std::uint32_t wi_delta_m_vecs = + hyper_params.get_wi_delta_m_vecs(); + static constexpr std::uint32_t m_vec_size = hyper_params.get_m_vec_size(); + + static constexpr std::uint32_t wi_total_delta_m = + wi_delta_m_vecs * m_vec_size; + + using KernelName = + class gemm_batch_nm_vecm_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t k_wg_sz = krn.template get_info< + sycl::info::kernel_device_specific::work_group_size>(dev); + + // Limit work-group size + static constexpr std::size_t wg_sz_limit(2048); + const std::size_t max_wg_sz = std::min(wg_sz_limit, k_wg_sz); + + const std::uint32_t max_subgroups_per_wg = + static_cast(max_wg_sz / max_sg_size); + + const std::size_t reserved_slm_byte_size = 512; + const std::size_t slm_byte_size = + dev.get_info(); + + const std::uint32_t wg_delta_n = max_sg_size; + std::uint32_t wg_delta_m = 0; + std::uint32_t wi_delta_k = 0; + + std::tie(wg_delta_m, wi_delta_k) = + get_wg_delta_m_and_wi_delta_k( + slm_byte_size - reserved_slm_byte_size, wg_delta_n, + max_subgroups_per_wg); + + const std::uint32_t lws = wg_delta_n * wg_delta_m; + + const std::size_t n_groups = + (n + wg_delta_n * wi_delta_n - 1) / (wg_delta_n * wi_delta_n); + const std::size_t m_groups = (m + wg_delta_m * wi_total_delta_m - 1) / + (wg_delta_m * wi_total_delta_m); + + const std::size_t gws = lws * batch_nelems * n_groups * m_groups; + + sycl::range<1> lRange(lws); + sycl::range<1> gRange(gws); + sycl::nd_range<1> ndRange(gRange, lRange); + + using slmB_t = + typename std::conditional>::type; + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + using LocAccT1 = sycl::local_accessor; + LocAccT1 local_A_block(wg_delta_n * wi_delta_n * wi_delta_k, cgh); + + using LocAccT2 = sycl::local_accessor; + LocAccT2 local_B_block(wg_delta_m * wi_delta_m_vecs * wi_delta_k, cgh); + + using Impl_FunctorT = GemmBatchFunctorThreadNM_vecm< + lhsTy, rhsTy, resTy, LocAccT1, LocAccT2, BatchIndexerT, LhsIndexerT, + RhsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m_vecs, m_vec_size>; + + cgh.parallel_for( + ndRange, Impl_FunctorT( + lhs_tp, rhs_tp, res_tp, std::move(local_A_block), + std::move(local_B_block), batch_nelems, n, k, m, + n_groups, wg_delta_n, wg_delta_m, wi_delta_k, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer)); + }); + return gemm_ev; +} + +} // 
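+
+// Dispatch sketch for the entry points below: the tiled NM kernel is chosen
+// when min(n, m) > 0 and max(n, m) >= (64 * 1024) / min(n, m), i.e. roughly
+// when n * m reaches 64 * 1024. For example, n = m = 256 gives
+// n * m == 65536 and takes the NM path; smaller problems fall through to the
+// small-m and thread-per-k kernels after zero-initializing the result.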
+
+typedef sycl::event (*gemm_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *,    // lhs
+    const char *,    // rhs
+    char *,          // res
+    std::size_t,     // lhs_outer_nelems (n)
+    std::size_t,     // inner_nelems (k)
+    std::size_t,     // rhs_outer_nelems (m)
+    int,             // inner nd
+    int,             // lhs outer nd
+    const ssize_t *, // lhs shape and strides
+    int,             // rhs outer nd
+    const ssize_t *, // rhs shape and strides
+    int,             // res outer nd
+    const ssize_t *, // res shape and strides
+    std::vector<sycl::event> const &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_impl(sycl::queue &exec_q,
+                      const char *lhs_cp,
+                      const char *rhs_cp,
+                      char *res_cp,
+                      std::size_t n,
+                      std::size_t k,
+                      std::size_t m,
+                      int inner_nd,
+                      int lhs_outer_nd,
+                      const ssize_t *lhs_shape_strides,
+                      int rhs_outer_nd,
+                      const ssize_t *rhs_shape_strides,
+                      int res_outer_nd,
+                      const ssize_t *res_shape_strides,
+                      std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    using OuterInnerIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const OuterInnerIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
+                                         lhs_shape_strides);
+    const OuterInnerIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
+                                         rhs_shape_strides);
+    const OuterInnerIndexerT res_indexer(res_outer_nd, 0, res_shape_strides);
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+
+    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        const IndexerT res_indexer(res_outer_nd, 0, res_shape_strides);
+        using InitKernelName = class gemm_init_krn<lhsTy, rhsTy, resTy>;
+        cgh.parallel_for<InitKernelName>(
+            sycl::range<1>(n * m), [=](sycl::id<1> id) {
+                auto res_offset = res_indexer(id[0]);
+                res_tp[res_offset] = resTy(0);
+            });
+    });
+
+    if (k == 0) {
+        return res_init_ev;
+    }
+
+    if (max_nm < 64) {
+        if (m < 4) {
+            return gemm_detail::_gemm_small_m_impl<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+                OuterInnerIndexerT, OuterInnerIndexerT>(
+                exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+                {res_init_ev});
+        }
+        return gemm_detail::_gemm_k_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+
+    return gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+        OuterInnerIndexerT, OuterInnerIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
+}
+
+typedef sycl::event (*gemm_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *, // lhs
+    const char *, // rhs
+    char *,       // res
+    std::size_t,  // n
+    std::size_t,  // k
+    std::size_t,  // m
+    std::vector<sycl::event> const &);
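+
+// Contiguous variant: lhs, rhs and res are assumed to be C-contiguous
+// (n, k), (k, m) and (n, m) blocks, so NoOpIndexer stands in for the strided
+// indexers (the flat id is the memory offset) and cgh.fill replaces the
+// strided zero-initialization kernel used in gemm_impl above.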
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_contig_impl(sycl::queue &exec_q,
+                             const char *lhs_cp,
+                             const char *rhs_cp,
+                             char *res_cp,
+                             std::size_t n,
+                             std::size_t k,
+                             std::size_t m,
+                             std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    using OuterInnerIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr OuterInnerIndexerT lhs_indexer{};
+    static constexpr OuterInnerIndexerT rhs_indexer{};
+    static constexpr OuterInnerIndexerT res_indexer{};
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+
+    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.fill(res_tp, resTy(0), n * m);
+    });
+
+    if (k == 0) {
+        return res_init_ev;
+    }
+
+    if (max_nm < 64) {
+        if (m < 4) {
+            return gemm_detail::_gemm_small_m_impl<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+                OuterInnerIndexerT, OuterInnerIndexerT>(
+                exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+                {res_init_ev});
+        }
+        return gemm_detail::_gemm_k_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+
+    return gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+        OuterInnerIndexerT, OuterInnerIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
+}
+
+template <typename T1, typename T2, typename T3>
+class gemm_batch_init_krn;
+
+typedef sycl::event (*gemm_batch_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *,    // lhs
+    const char *,    // rhs
+    char *,          // res
+    std::size_t,     // batch nelems
+    std::size_t,     // lhs outer nelems (n)
+    std::size_t,     // inner nelems (k)
+    std::size_t,     // rhs outer nelems (m)
+    int,             // batching nd
+    const ssize_t *, // batch shape strides
+    ssize_t,         // lhs batch offset
+    ssize_t,         // rhs batch offset
+    ssize_t,         // res batch offset
+    int,             // inner dims
+    int,             // lhs outer dims
+    const ssize_t *, // lhs outer and inner shape and strides
+    int,             // rhs outer dims
+    const ssize_t *, // rhs outer and inner shape and strides
+    int,             // res outer dims
+    const ssize_t *, // res outer and inner shape and strides
+    const ssize_t *, // res full shape and strides
+    std::vector<sycl::event> const &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_batch_impl(sycl::queue &exec_q,
+                            const char *lhs_cp,
+                            const char *rhs_cp,
+                            char *res_cp,
+                            std::size_t batch_nelems,
+                            std::size_t n,
+                            std::size_t k,
+                            std::size_t m,
+                            int batch_nd,
+                            const ssize_t *batch_shape_strides,
+                            ssize_t lhs_batch_offset,
+                            ssize_t rhs_batch_offset,
+                            ssize_t res_batch_offset,
+                            int inner_nd,
+                            int lhs_outer_nd,
+                            const ssize_t *lhs_outer_inner_shapes_strides,
+                            int rhs_outer_nd,
+                            const ssize_t *rhs_outer_inner_shapes_strides,
+                            int res_outer_nd,
+                            const ssize_t *res_outer_shapes_strides,
+                            const ssize_t *res_shape_strides,
+                            std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy
*lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_outer_shapes_strides); + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const BatchDimsIndexerT batch_indexer(batch_nd, lhs_batch_offset, + rhs_batch_offset, res_batch_offset, + batch_shape_strides); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + } + + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT res_indexer(batch_nd + res_outer_nd, res_batch_offset, + res_shape_strides); + using InitKernelName = class gemm_batch_init_krn; + cgh.parallel_for( + sycl::range<1>(n * m * batch_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = resTy(0); + }); + }); + + if (k == 0) { + return res_init_ev; + } + + if (m < 4) { + return gemm_detail::_gemm_small_m_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } + else if (k > n && k > m) { + return gemm_detail::_gemm_k_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } + else { + return gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } +} + +typedef sycl::event (*gemm_batch_contig_impl_fn_ptr_t)( + sycl::queue &, + const char *, // lhs + const char *, // rhs + char *, // res + std::size_t, // batch nelems + std::size_t, // n + std::size_t, // k + std::size_t, // m + ssize_t, // lhs batch offset + ssize_t, // rhs batch offset + ssize_t, // res batch offset + std::vector const &); + +template +sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = + reinterpret_cast(lhs_cp) + lhs_batch_offset; + const rhsTy *rhs_tp = + reinterpret_cast(rhs_cp) + rhs_batch_offset; + resTy *res_tp = reinterpret_cast(res_cp) + res_batch_offset; + + using OuterInnerDimsIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + } + + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(0), n * m * batch_nelems); + }); + + if (k == 0) { + return res_init_ev; + } + + if (max_nm < 64) { + if (m < 4) { + return gemm_detail::_gemm_small_m_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } + return gemm_detail::_gemm_k_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } + + return gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, {res_init_ev}); +} + +// ========== Gemm Tree + +template +class GemmBatchNoAtomicFunctorThreadNM +{ +private: + const lhsT *lhs = nullptr; + const rhsT *rhs = nullptr; + resT *res = nullptr; + LocAccT1 local_A_block; + LocAccT2 local_B_block; + std::size_t n = 0; + std::size_t wg_delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t wi_delta_k = 0; + std::size_t m = 0; + std::size_t m_blocks = 0; + std::size_t wg_delta_m = 0; + std::size_t batch_nelems; + BatchDimsIndexerT batch_indexer; + OuterInnerDimsIndexerT lhs_indexer; + OuterInnerDimsIndexerT rhs_indexer; + ResIndexerT res_indexer; + +public: + GemmBatchNoAtomicFunctorThreadNM(const lhsT *lhs_, + const rhsT *rhs_, + resT *res_, + LocAccT1 local_A_block_, + LocAccT2 local_B_block_, + std::size_t n_, + std::size_t wg_delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t wi_delta_k_, + std::size_t m_, + std::size_t m_blocks_, + std::size_t wg_delta_m_, + std::size_t batch_nelems_, + const BatchDimsIndexerT batch_indexer_, + const OuterInnerDimsIndexerT lhs_indexer_, + const OuterInnerDimsIndexerT rhs_indexer_, + const ResIndexerT res_indexer_) + : lhs(lhs_), rhs(rhs_), res(res_), local_A_block(local_A_block_), + local_B_block(local_B_block_), n(n_), wg_delta_n(wg_delta_n_), k(k_), + k_blocks(k_blocks_), wi_delta_k(wi_delta_k_), m(m_), + m_blocks(m_blocks_), 
wg_delta_m(wg_delta_m_), + batch_nelems(batch_nelems_), batch_indexer(batch_indexer_), + lhs_indexer(lhs_indexer_), rhs_indexer(rhs_indexer_), + res_indexer(res_indexer_) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = + it.get_group_linear_id() - m_id * n_groups_per_batch; + + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); + + // lift group_id to (block_i, block_j, block_s), + // 0 <= block_i < n_blocks, 0 <= block_j < m_blocks, 0 <= block_s + // < k_blocks + + const auto &lhs_offset = three_offsets_.get_first_offset(); + const auto &rhs_offset = three_offsets_.get_second_offset(); + const auto &res_offset = three_offsets_.get_third_offset(); + + std::size_t block_i = gr_id / (m_blocks * k_blocks); + std::size_t block_r = gr_id - block_i * (m_blocks * k_blocks); + std::size_t block_j = block_r / k_blocks; + std::size_t block_s = block_r - block_j * k_blocks; + + std::size_t lid = it.get_local_linear_id(); + std::size_t local_i = lid / wg_delta_m; // 0<= local_i < wg_delta_n + std::size_t local_j = + lid - local_i * wg_delta_m; // 0<= local_j < wg_delta_m + + // load A block and B blocks into SLM + + std::size_t i = block_i * wi_delta_n * wg_delta_n; + std::size_t j = block_j * wi_delta_m * wg_delta_m; + std::size_t s = block_s * wi_delta_k; + + const std::int64_t a_st0 = k; + const std::int64_t a_st1 = 1; + + const std::int64_t b_st0 = m; + const std::int64_t b_st1 = 1; + + const std::int64_t c_st0 = m; + const std::int64_t c_st1 = 1; + + std::size_t lws = it.get_local_range(0); + + for (std::size_t vid = lid; vid < local_A_block.size(); vid += lws) { + std::size_t v_i = + vid / wi_delta_k; // 0<= v_i < wg_delta_n * wi_delta_n + std::size_t v_s = vid - v_i * wi_delta_k; // 0<= v_s < wi_delta_k + + std::size_t g_i = i + v_i; + std::size_t g_s = s + v_s; + + local_A_block[vid] = + (g_i < n && g_s < k) + ? static_cast( + lhs[lhs_offset + + lhs_indexer(g_i * a_st0 + g_s * a_st1)]) + : resT(0); + } + + using slmB_t = typename LocAccT2::value_type; + + for (std::size_t vid = lid; vid < local_B_block.size(); vid += lws) { + std::size_t v_j = vid / wi_delta_k; // 0<= v_i < wg_delta_m + std::size_t v_s = vid - v_j * wi_delta_k; // 0<= v_s < wi_delta_k + + std::size_t g_j = j + v_j * wi_delta_m; + std::size_t g_s = s + v_s; + + if constexpr (wi_delta_m == 1 && std::is_same_v) { + local_B_block[vid] = + (g_j < m && g_s < k) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(g_s * b_st0 + g_j * b_st1)]) + : resT(0); + } + else { + slmB_t vec{}; +#pragma unroll + for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; + ++lane_id) { + std::size_t g_j1 = g_j + lane_id; + vec[lane_id] = + (g_j1 < m && g_s < k) + ? 
static_cast( + rhs[rhs_offset + + rhs_indexer(g_s * b_st0 + g_j1 * b_st1)]) + : resT(0); + } + + local_B_block[vid] = vec; + } + } + + it.barrier(sycl::access::fence_space::local_space); + + i += local_i * wi_delta_n; + j += local_j * wi_delta_m; + + const std::size_t a_offset = local_i * wi_delta_k * wi_delta_n; + const std::size_t b_offset = local_j * wi_delta_k; + + static constexpr resT identity_(0); + + for (std::uint8_t private_i = 0; private_i < wi_delta_n; ++private_i) { + const std::size_t a_pr_offset = private_i * wi_delta_k; + + slmB_t local_sum(identity_); + for (std::size_t private_s = 0; private_s < wi_delta_k; + ++private_s) { + local_sum = local_sum + + (local_A_block[a_offset + a_pr_offset + private_s] * + local_B_block[b_offset + private_s]); + } + + const std::size_t gl_i = i + private_i; + + if constexpr (wi_delta_m == 1 && std::is_same_v) { + const std::size_t gl_j = j; + if (gl_i < n && gl_j < m) { + res[res_offset + res_indexer(gl_i * c_st0 + gl_j * c_st1) + + (block_s * n * m * batch_nelems)] = local_sum; + } + } + else { +#pragma unroll + for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; + ++lane_id) { + const std::size_t gl_j = j + lane_id; + + if (gl_i < n && gl_j < m) { + res[res_offset + + res_indexer(gl_i * c_st0 + gl_j * c_st1) + + (block_s * n * m * batch_nelems)] = + local_sum[lane_id]; + } + } + } + } + } +}; + +template +class GemmBatchNoAtomicFunctorThreadK +{ +private: + const lhsT *lhs = nullptr; + const rhsT *rhs = nullptr; + resT *res = nullptr; + LocAccT workspace; + LocAccT local_B_block; + std::size_t n = 0; + std::size_t n_blocks = 0; + std::size_t delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t delta_k = 0; + std::size_t n_wi = 0; + std::size_t m = 0; + std::size_t batch_nelems = 0; + BatchDimsIndexerT batch_indexer; + OuterInnerDimsIndexerT lhs_indexer; + OuterInnerDimsIndexerT rhs_indexer; + ResIndexerT res_indexer; + +public: + GemmBatchNoAtomicFunctorThreadK(const lhsT *lhs_, + const rhsT *rhs_, + resT *res_, + LocAccT workspace_, + LocAccT local_B_block_, + std::size_t n_, + std::size_t n_blocks_, + std::size_t delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t delta_k_, + std::size_t n_wi_, + std::size_t m_, + std::size_t batch_nelems_, + const BatchDimsIndexerT &batch_indexer_, + const OuterInnerDimsIndexerT &lhs_indexer_, + const OuterInnerDimsIndexerT &rhs_indexer_, + const ResIndexerT &res_indexer_) + : lhs(lhs_), rhs(rhs_), res(res_), workspace(workspace_), + local_B_block(local_B_block_), n(n_), n_blocks(n_blocks_), + delta_n(delta_n_), k(k_), k_blocks(k_blocks_), delta_k(delta_k_), + n_wi(n_wi_), m(m_), batch_nelems(batch_nelems_), + batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_), + rhs_indexer(rhs_indexer_), res_indexer(res_indexer_) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = + it.get_group_linear_id() - m_id * n_groups_per_batch; + std::size_t lid = it.get_local_linear_id(); + + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); + const auto &lhs_offset = three_offsets_.get_first_offset(); + const auto &rhs_offset = three_offsets_.get_second_offset(); + const auto &res_offset = three_offsets_.get_third_offset(); + + // lift gr_id -> (block_i, block_j, block_s) + // block_i moves fastest, then block_s, then block_j + + const std::size_t r_size = (n_blocks * k_blocks); 
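+        // Example of the lifting below: with n_blocks = 3 and k_blocks = 2,
+        // r_size = 6; gr_id = 7 decomposes to block_j = 1, block_s = 0,
+        // block_i = 1, consistent with
+        // gr_id = block_j * r_size + block_s * n_blocks + block_i.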
+ // 0 <= block_j < m_blocks + std::size_t block_j = gr_id / r_size; + // 0 <= block_r < n_blocks * k_blocks + std::size_t block_r = gr_id - block_j * r_size; + // 0 <= block_s < k_blocks + std::size_t block_s = block_r / n_blocks; + // 0 <= block_i < n_blocks + std::size_t block_i = block_r - block_s * n_blocks; + + std::size_t local_i = lid / (delta_k); // 0 <= local_i < delta_n + std::size_t local_s = + lid - local_i * (delta_k); // 0 <= local_s < delta_k + + std::size_t i = block_i * delta_n + local_i; + std::size_t j = m_groups * block_j; + std::size_t s = block_s * delta_k * n_wi + local_s; + + using accV_t = typename LocAccT::value_type; + + static constexpr resT identity_ = resT(0); + if (local_i == 0) { + for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { + std::size_t sq = s + q; + std::size_t sqmj = sq * m + j; + + if constexpr (m_groups == 1 && std::is_same_v) { + local_B_block[local_s + q] = + (sq < k && j < m) + ? static_cast( + rhs[rhs_offset + rhs_indexer(sqmj)]) + : identity_; + } + else { + accV_t local_B_vec; +#pragma unroll + for (std::size_t vec_idx = 0; vec_idx < m_groups; + ++vec_idx) { + local_B_vec[vec_idx] = + (sq < k && j + vec_idx < m) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(sqmj + vec_idx)]) + : identity_; + } + local_B_block[local_s + q] = local_B_vec; + } + } + } + + it.barrier(sycl::access::fence_space::local_space); + + std::size_t t_shift = block_s * delta_k * n_wi; + std::size_t global_s_offset = i * k + t_shift; + + accV_t private_sum(identity_); + static constexpr accV_t vec_identity_(identity_); + for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) { + private_sum += + ((i < n) && (t + t_shift < k)) + ? (static_cast( + lhs[lhs_offset + lhs_indexer(global_s_offset + t)]) * + local_B_block[t]) + : vec_identity_; + } + + std::size_t workspace_i_shift = local_i * delta_k; + workspace[workspace_i_shift + local_s] = private_sum; + + it.barrier(sycl::access::fence_space::local_space); + + if (local_s == 0 && i < n) { + accV_t local_sum(workspace[workspace_i_shift]); + for (std::size_t t = 1; t < delta_k; ++t) { + local_sum += workspace[workspace_i_shift + t]; + } + + const std::size_t total_offset = + res_offset + (block_s * n * m * batch_nelems); + + if constexpr (m_groups == 1 && std::is_same_v) { + res[total_offset + res_indexer(i * m + j)] = local_sum; + } + else { + res[total_offset + res_indexer(i * m + j)] = local_sum[0]; + +#pragma unroll + for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) { + if (j + vec_id < m) { + res[total_offset + res_indexer(i * m + j + vec_id)] = + local_sum[vec_id]; + } + } + } + } + } +}; + +template +class gemm_batch_tree_k_krn; + +template +class gemm_batch_tree_nm_krn; + +namespace gemm_detail +{ + +template +sycl::event _gemm_tree_k_step(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const std::size_t delta_n, + const std::size_t n_wi, + const std::size_t delta_k, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + const std::vector &depends) +{ + static_assert(std::is_same_v); + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t n_blocks = (n + delta_n - 1) / delta_n; + const std::size_t k_blocks = + (k + n_wi * delta_k - 1) / (n_wi * delta_k); + const std::size_t m_blocks = (m + m_groups 
- 1) / m_groups; + + const std::size_t lws = delta_n * delta_k; + const std::size_t gws = + batch_nelems * n_blocks * m_blocks * k_blocks * lws; + + auto gRange = sycl::range<1>(gws); + auto lRange = sycl::range<1>(lws); + auto ndRange = sycl::nd_range<1>(gRange, lRange); + + using slmB_t = + typename std::conditional>::type; + + using LocAccT = sycl::local_accessor; + LocAccT local_B_block(n_wi * delta_k, cgh); + LocAccT workspace(delta_n * delta_k, cgh); + + using KernelName = + class gemm_batch_tree_k_krn; + + cgh.parallel_for( + ndRange, + GemmBatchNoAtomicFunctorThreadK( + lhs_tp, rhs_tp, res_tp, std::move(workspace), + std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks, + delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer)); + }); + return gemm_ev; +} + +} // end of namespace gemm_detail + +template +sycl::event + gemm_batch_tree_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends) +{ + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + if (k <= (delta_k * n_wi)) { + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_outer_shapes_strides); + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const BatchDimsIndexerT batch_indexer( + batch_nd, lhs_batch_offset, rhs_batch_offset, res_batch_offset, + batch_shape_strides); + + return gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, delta_n, + n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); + + // more than one work-group is needed, requires a + // temporary delta_k * n_wi elements processed along k, + // so if more to process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t 
reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + // max_max_wg prevents running out of resources on CPU + static constexpr std::size_t max_max_wg = 2048; + std::size_t max_wg = std::min( + max_max_wg, + dev.get_info() / 2); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + static constexpr TmpIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::StridedIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer< + StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>; + const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset, + batch_shape_strides); + const UnpackedStridedIndexer rhs_batch_indexer( + batch_nd, rhs_batch_offset, batch_shape_strides, + batch_shape_strides + 2 * batch_nd); + const Strided1DIndexer tmp_batch_indexer( + /* size */ batch_nelems, + /* step */ n * m); + const BatchDimsIndexerT batch_indexer( + lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, TmpIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, delta_n, + n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = single_reduction_for_gemm( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, + batch_nd + res_outer_nd, res_batch_offset, res_shape_strides, + {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * ( + /* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + // get unique_ptr owning the temporary allocation + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + // get raw USM pointer + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + ; + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + static constexpr TmpIndexerT res_indexer{}; + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::StridedIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using 
BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset, + batch_shape_strides); + const StridedIndexer rhs_batch_indexer( + batch_nd, rhs_batch_offset, batch_shape_strides + 2 * batch_nd); + const Strided1DIndexer tmp_batch_indexer( + /* size */ batch_nelems, + /* step */ n * m); + const BatchDimsIndexerT batch_indexer( + lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, TmpIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n, + k, m, delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer, depends); + + sycl::event red_ev = tree_reduction_for_gemm( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp, + identity_val, iter_nelems, reduction_nelems, reduction_groups, + wg, max_wg, preferred_reductions_per_wi, reductions_per_wi, + batch_nd + res_outer_nd, res_batch_offset, res_shape_strides, + {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +namespace gemm_detail +{ + +template +sycl::event _gemm_tree_nm_step(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const std::uint32_t wg_delta_n, + const std::uint32_t wg_delta_m, + const std::uint32_t wi_delta_k, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + const std::vector &depends) +{ + static_assert(std::is_same_v); + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lws = wg_delta_n * wg_delta_m; + + const std::size_t n_blocks = + ((n + wi_delta_n * wg_delta_n - 1) / (wi_delta_n * wg_delta_n)); + const std::size_t k_blocks = ((k + wi_delta_k - 1) / wi_delta_k); + const std::size_t m_blocks = + ((m + wi_delta_m * wg_delta_m - 1) / (wi_delta_m * wg_delta_m)); + + const std::size_t gws = + batch_nelems * n_blocks * m_blocks * k_blocks * lws; + + auto gwsRange = sycl::range<1>(gws); + auto lwsRange = sycl::range<1>(lws); + auto ndRange = sycl::nd_range<1>(gwsRange, lwsRange); + + using slmB_t = + typename std::conditional>::type; + using LocAccT1 = sycl::local_accessor; + using LocAccT2 = sycl::local_accessor; + + const sycl::range<1> local_A_size((wi_delta_n * wg_delta_n) * + wi_delta_k); + const sycl::range<1> local_B_size(wi_delta_k * wg_delta_m); + + LocAccT1 local_A_block(local_A_size, cgh); + LocAccT2 local_B_block(local_B_size, cgh); + + using KernelName = + class gemm_batch_tree_nm_krn; + cgh.parallel_for( + ndRange, GemmBatchNoAtomicFunctorThreadNM< + lhsTy, rhsTy, resTy, LocAccT1, LocAccT2, LhsIndexerT, + ResIndexerT, BatchIndexerT, wi_delta_n, wi_delta_m>( + lhs_tp, rhs_tp, res_tp, std::move(local_A_block), + std::move(local_B_block), n, wg_delta_n, k, k_blocks, + wi_delta_k, m, m_blocks, wg_delta_m, batch_nelems, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer)); + }); + return gemm_ev; +} + +} // end namespace gemm_detail + +template +sycl::event + gemm_batch_tree_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + 
std::size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends) +{ + static constexpr int wi_delta_n = 2; + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_nm_parameters( + local_mem_size, reserved_slm_size, wi_delta_n, + wi_delta_k, // modified by reference + wg_delta_n, // modified by reference + wg_delta_m // modified by reference + ); + + // each group processes delta_k * n_wi + // items in a column, so no need for allocating + // temp memory if only one group is needed + if (k <= wi_delta_k) { + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_outer_shapes_strides); + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const BatchDimsIndexerT batch_indexer( + batch_nd, lhs_batch_offset, rhs_batch_offset, res_batch_offset, + batch_shape_strides); + + return gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + + // more than one work-group is needed, requires a temporary + // delta_k * n_wi elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + 
inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + static constexpr TmpIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::StridedIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer< + StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>; + const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset, + batch_shape_strides); + const UnpackedStridedIndexer rhs_batch_indexer( + batch_nd, rhs_batch_offset, batch_shape_strides, + batch_shape_strides + 2 * batch_nd); + const Strided1DIndexer tmp_batch_indexer( + /* size */ batch_nelems, + /* step */ n * m); + const BatchDimsIndexerT batch_indexer( + lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, TmpIndexerT, wi_delta_n, wi_delta_m>( + exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, wg_delta_n, + wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = single_reduction_for_gemm( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, + batch_nd + res_outer_nd, res_batch_offset, res_shape_strides, + {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + ; + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + static constexpr TmpIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::StridedIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer< + StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>; + + const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset, + batch_shape_strides); + const UnpackedStridedIndexer rhs_batch_indexer( + batch_nd, rhs_batch_offset, batch_shape_strides, + batch_shape_strides + 2 * batch_nd); + const Strided1DIndexer tmp_batch_indexer( + /* size */ batch_nelems, + /* step */ n * m); + const BatchDimsIndexerT batch_indexer( + lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, TmpIndexerT, wi_delta_n, wi_delta_m>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n, + k, 
m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + + sycl::event red_ev = tree_reduction_for_gemm( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp, + identity_val, iter_nelems, reduction_nelems, reduction_groups, + wg, max_wg, preferred_reductions_per_wi, reductions_per_wi, + batch_nd + res_outer_nd, res_batch_offset, res_shape_strides, + {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_batch_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends = {}) +{ + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_outer_shapes_strides); + + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const BatchDimsIndexerT batch_indexer(batch_nd, lhs_batch_offset, + rhs_batch_offset, res_batch_offset, + batch_shape_strides); + + sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + + return gemm_ev; +} + +template +class gemm_batch_tree_empty_krn; + +template +sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_batch_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, + batch_shape_strides, lhs_batch_offset, rhs_batch_offset, + res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + + if (k == 0) { + sycl::event gemm_batch_no_reduction_ev = + 
exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT res_indexer(batch_nd + res_outer_nd, + res_batch_offset, res_shape_strides); + using InitKernelName = + class gemm_batch_tree_empty_krn; + cgh.parallel_for( + sycl::range<1>(n * m * batch_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = resTy(0); + }); + }); + return gemm_batch_no_reduction_ev; + } + + if (max_nm < 64) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + if (m < 4) { + static constexpr std::uint32_t m_groups_one = 1; + return gemm_batch_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_nd, batch_shape_strides, lhs_batch_offset, + rhs_batch_offset, res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + else { + static constexpr std::uint32_t m_groups_four = 4; + return gemm_batch_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_nd, batch_shape_strides, lhs_batch_offset, + rhs_batch_offset, res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + } + else { + static constexpr std::uint32_t m_groups_one = 1; + return gemm_batch_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, + batch_shape_strides, lhs_batch_offset, rhs_batch_offset, + res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + } + else { // m > 1, n > k or m > k + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + static constexpr std::uint32_t m_groups_four = 4; + return gemm_batch_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, + batch_shape_strides, lhs_batch_offset, rhs_batch_offset, + res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + else { // m > 1, n > k or m > k, resTy complex + static constexpr std::uint32_t m_groups_one = 1; + return gemm_batch_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, + batch_shape_strides, lhs_batch_offset, rhs_batch_offset, + res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + } +} + +template +sycl::event + gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends) +{ + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + if (k <= (delta_k * n_wi)) { + 
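+        // Single-block case: k fits in one reduction block of delta_k * n_wi
+        // elements (e.g. 4 * 64 = 256 with the unscaled defaults above), so
+        // one kernel launch writes res directly and no temporary allocation
+        // or follow-up reduction pass is needed.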
using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + return gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, delta_n, + n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); + + // more than one work-group is needed, requires a + // temporary delta_k * n_wi elements processed along k, + // so if more to process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, delta_n, + n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + tmp_indexer, depends); + + sycl::event red_ev = + single_reduction_for_gemm_contig( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + 
else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n, + k, m, delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, + rhs_indexer, tmp_indexer, depends); + + sycl::event red_ev = + tree_reduction_for_gemm_contig( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, + res_tp, identity_val, iter_nelems, reduction_nelems, + reduction_groups, wg, max_wg, preferred_reductions_per_wi, + reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event + gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends) +{ + static constexpr int wi_delta_n = 2; + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_nm_parameters( + local_mem_size, reserved_slm_size, wi_delta_n, + wi_delta_k, // modified by reference + wg_delta_n, // modified by reference + wg_delta_m // modified by reference + ); + + // each group processes delta_k * n_wi + // items in a column, so no need for allocating + // temp memory if only one group is needed + if (k <= wi_delta_k) { + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + 
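+        // Contiguous batching: batch b starts at offset b * n * k in lhs,
+        // b * k * m in rhs and b * n * m in res, which is exactly what the
+        // three Strided1DIndexer steps above encode; e.g. n = 4, k = 5,
+        // m = 3 gives steps 20, 15 and 12.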
return gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + + // more than one work-group is needed, requires a temporary + // delta_k * n_wi elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + + resTy *tmp = tmp_owner.get(); + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, + wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, tmp_indexer, depends); + + sycl::event red_ev = + single_reduction_for_gemm_contig( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; + + using 
dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + batch_nelems, n, k, m, wg_delta_n, wg_delta_m, + wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer, + tmp_indexer, depends); + + sycl::event red_ev = + tree_reduction_for_gemm_contig( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, + res_tp, identity_val, iter_nelems, reduction_nelems, + reduction_groups, wg, max_wg, preferred_reductions_per_wi, + reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_shape_strides, + int rhs_outer_nd, + const ssize_t *rhs_shape_strides, + int res_outer_nd, + const ssize_t *res_shape_strides, + std::vector const &depends = {}) +{ + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_shape_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_shape_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_shape_strides); + + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchDimsIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + return gemm_ev; +} + +template +sycl::event + gemm_batch_nm_contig_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends = {}) +{ + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + if (batch_nelems == single_batch_nelems) { + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchDimsIndexerT batch_indexer{}; + + sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + return 
gemm_ev; + } + else { + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + return gemm_ev; + } +} + +template +sycl::event + gemm_batch_contig_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = + reinterpret_cast(lhs_cp) + lhs_batch_offset; + const rhsTy *rhs_tp = + reinterpret_cast(rhs_cp) + rhs_batch_offset; + resTy *res_tp = reinterpret_cast(res_cp) + res_batch_offset; + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_batch_nm_contig_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends); + } + + if (k == 0) { + sycl::event gemm_batch_no_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(0), n * m * batch_nelems); + }); + return gemm_batch_no_reduction_ev; + } + + if (max_nm < 64) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + if (m < 4) { + return gemm_batch_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + depends); + } + else { + return gemm_batch_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + depends); + } + } + else { + return gemm_batch_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends); + } + } + else { // m > 1, n > k or m > k + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + return gemm_batch_contig_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends); + } + else { // m > 1, n > k or m > k, resTy complex + return gemm_batch_contig_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends); + } + } +} + +// Gemm tree non-batched + +template +class gemm_tree_nm_krn; + +template +class gemm_tree_k_krn; + +template +sycl::event gemm_tree_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_nd, + const ssize_t *res_shapes_strides, + const std::vector &depends) +{ + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, 
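+        // scale_gemm_k_parameters shrinks delta_k and n_wi until one
+        // work-group's SLM scratch fits into local_mem_size minus the
+        // reserved bytes; a simplified sketch of the budget it enforces
+        // (assumed here, not the verbatim implementation):
+        //   while (sizeof(resTy) * m_groups * (delta_n + n_wi) * delta_k
+        //          + reserved_slm_size >= local_mem_size)
+        //       halve delta_k (and, once delta_k == 1, halve n_wi);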
reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_outer_inner_shapes_strides); + + sycl::event gemm_ev; + if (k <= (delta_k * n_wi)) { + const OuterInnerDimsIndexerT res_indexer(res_nd, 0, res_shapes_strides); + + return gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); + + // more than one work-groups is needed, requires a temporary + // delta_k * n_wi elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr ResIndexerT res_indexer{}; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, ResIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m, + delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = single_reduction_for_gemm( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, res_nd, 0, + res_shapes_strides, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr ResIndexerT res_indexer{}; + + sycl::event gemm_ev = 
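+            // The GEMM step below writes its partial products into the
+            // contiguous scratch (hence NoOpIndexer as the result indexer);
+            // the strided layout of the real output is applied only by the
+            // reduction kernel, which receives res_nd and res_shapes_strides.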
gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, ResIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + single_batch_nelems, n, k, m, delta_n, n_wi, delta_k, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + // tree_reduction_for_gemm returns sycl::event for reduction + sycl::event red_ev = tree_reduction_for_gemm( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp, + identity_val, iter_nelems, reduction_nelems, reduction_groups, + wg, max_wg, preferred_reductions_per_wi, reductions_per_wi, + res_nd, 0, res_shapes_strides, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_nd, + const ssize_t *res_shapes_strides, + const std::vector &depends) +{ + static constexpr int wi_delta_n = 2; + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_nm_parameters( + local_mem_size, reserved_slm_size, wi_delta_n, + wi_delta_k, // modified by reference + wg_delta_n, // modified by reference + wg_delta_m // modified by reference + ); + + using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_outer_inner_shapes_strides); + + // each group processes delta_k items in a column, + // so no need to allocate temp memory if one group needed + if (k <= wi_delta_k) { + const OuterInnerDimsIndexerT res_indexer(res_nd, 0, res_shapes_strides); + + return gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, + k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + + // more than one work-groups is needed, requires a temporary + // wi_delta_k elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t 
reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr ResIndexerT res_indexer{}; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m>( + exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m, + wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer, depends); + + sycl::event red_ev = single_reduction_for_gemm( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, res_nd, 0, + res_shapes_strides, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr ResIndexerT res_indexer{}; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + single_batch_nelems, n, k, m, wg_delta_n, wg_delta_m, + wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = tree_reduction_for_gemm( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp, + identity_val, iter_nelems, reduction_nelems, reduction_groups, + wg, max_wg, preferred_reductions_per_wi, reductions_per_wi, + res_nd, 0, res_shapes_strides, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +class gemm_tree_empty_krn; + +template +sycl::event gemm_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t n, + std::size_t k, + std::size_t m, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_nd, + const ssize_t *res_shapes_strides, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd, + 
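+            // Dispatch note: the guard above, max_nm >= (64 * 1024) / min_nm,
+            // is an integer-arithmetic test for n * m >= 64K, i.e. outputs of
+            // at least 65536 elements go to the dedicated nm kernel without a
+            // k-tree reduction. Illustrative check (values not from this
+            // file): n = 512, m = 256 gives 64K / 256 = 256 <= 512.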
lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + + if (k == 0) { + sycl::event gemm_no_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT res_indexer(res_nd, 0, res_shapes_strides); + using InitKernelName = + class gemm_tree_empty_krn; + cgh.parallel_for( + sycl::range<1>(n * m), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = resTy(0); + }); + }); + return gemm_no_reduction_ev; + } + + if (max_nm < 64) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + if (m < 4) { + return gemm_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, + lhs_outer_nd, lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + else { + return gemm_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, + lhs_outer_nd, lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + } + else { + return gemm_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + } + else { // m > 1, n > k or m > k + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + return gemm_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + else { + return gemm_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + } +} + +template +sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends) +{ + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + sycl::event gemm_ev; + if (k <= (delta_k * n_wi)) { + return gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy 
identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); + + // more than one work-groups is needed, requires a + // temporary delta_k * n_wi elements processed along k, + // so if more to process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m, + delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = + single_reduction_for_gemm_contig( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + single_batch_nelems, n, k, m, delta_n, n_wi, delta_k, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + // tree_reduction_for_gemm_contig returns sycl::event + // for reduction + sycl::event red_ev = + tree_reduction_for_gemm_contig( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, + res_tp, identity_val, iter_nelems, reduction_nelems, + reduction_groups, wg, max_wg, preferred_reductions_per_wi, + reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends) +{ + static constexpr int wi_delta_n = 2; + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t 
reserved_slm_size = 512; + + gemm_detail::scale_gemm_nm_parameters( + local_mem_size, reserved_slm_size, wi_delta_n, + wi_delta_k, // modified by reference + wg_delta_n, // modified by reference + wg_delta_m // modified by reference + ); + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + // each group processes delta_k items in a column, + // so no need to allocate temp memory if one group needed + if (k <= wi_delta_k) { + + return gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, + k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + + // more than one work-groups is needed, requires a temporary + // wi_delta_k elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, + k, m, wg_delta_n, wg_delta_m, wi_delta_k, + batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = + single_reduction_for_gemm_contig( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + 
OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + single_batch_nelems, n, k, m, wg_delta_n, + wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer, depends); + + sycl::event red_ev = + tree_reduction_for_gemm_contig( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, + res_tp, identity_val, iter_nelems, reduction_nelems, + reduction_groups, wg, max_wg, preferred_reductions_per_wi, + reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_contig_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + static constexpr std::size_t single_batch_nelems = 1; + return gemm_batch_nm_contig_impl( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + depends); + } + + if (k == 0) { + sycl::event gemm_no_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(0), n * m); + }); + return gemm_no_reduction_ev; + } + + if (max_nm < 64) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + if (m < 4) { + return gemm_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + else { + return gemm_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + } + else { + return gemm_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + } + else { // m > 1, n > k or m > k + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + return gemm_contig_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + else { + return gemm_contig_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + } +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/reductions.hpp b/dpnp/tensor/libtensor/include/kernels/reductions.hpp new file mode 100644 index 000000000000..75df2c201968 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/reductions.hpp @@ -0,0 +1,3313 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor reduction along axis. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace reduction_detail +{ + +inline std::size_t get_work_group_size(const sycl::device &d) +{ + // prevents running out of resources on CPU + return std::min( + 2048, d.get_info() / 2); +} + +} // namespace reduction_detail + +template +struct needs_workaround +{ + static constexpr bool value = + (std::is_same_v> && + (std::is_same_v || + std::is_same_v)) || + (__LIBSYCL_MAJOR_VERSION < 7 && std::is_same_v && + std::is_same_v>); +}; + +template +struct can_use_reduce_over_group +{ + static constexpr bool value = + sycl::has_known_identity::value && + !needs_workaround::value; +}; + +template +struct SequentialReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialReduction(const argT *inp, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + outT red_val(identity_); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr 
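+            // assumed rationale: for logical reductions the input is first
+            // converted through bool, so that NaN (which compares unequal
+            // to zero) maps to true, matching any/all semantics; all other
+            // operators convert the input value directly to outT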
(su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) { + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + red_val = reduction_op_(red_val, val); + } + + out_[out_iter_offset] = red_val; + } +}; + +/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */ + +/* + This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8 + if the device has aspect atomic64 and only with those supported by + sycl::atomic_ref +*/ +template +struct ReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + ReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg; + if constexpr (su_ns::IsLogicalAnd::value) { + red_val_over_wg = static_cast( + sycl::all_of_group(work_group, local_red_val)); + } + else if constexpr (su_ns::IsLogicalOr::value) { + red_val_over_wg = static_cast( + sycl::any_of_group(work_group, local_red_val)); + } + else { + red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val, + identity_, 
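+            // sycl::reduce_over_group folds each work-item's local_red_val
+            // across the whole work-group (conceptually in log2(wg) steps);
+            // e.g. for sycl::plus over per-item values {1, 2, 3, 4} every
+            // work-item in the group receives 10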
reduction_op_); + } + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + if constexpr (su_ns::IsPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else if constexpr (su_ns::IsLogicalAnd::value) { + res_ref.fetch_and(red_val_over_wg); + } + else if constexpr (su_ns::IsLogicalOr::value) { + res_ref.fetch_or(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, 
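+            // custom_reduce_over_group is the SLM-based fallback used when
+            // can_use_reduce_over_group is false (no sycl::known_identity,
+            // or a known work-around case): partial values are staged in the
+            // local_mem_ accessor and folded by a barrier-synchronized
+            // work-group tree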
local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + // retain these checks in case a reduce_over_group work-around is + // needed + if constexpr (su_ns::IsSyclPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsSyclMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclLogicalAnd::value) { + res_ref.fetch_and(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclLogicalOr::value) { + res_ref.fetch_or(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +template +struct ReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + ReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg; + 
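+        // logical AND / OR are lowered onto the group vote functions,
+        // presumably because they fall outside the operator set that
+        // sycl::reduce_over_group handles here: all_of_group(wg, v)
+        // evaluates v_0 && v_1 && ... && v_{wg-1}, and any_of_group the
+        // corresponding || chain, which is exactly the desired reduction
+        // for bool-like values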
if constexpr (su_ns::IsLogicalAnd::value) { + red_val_over_wg = sycl::all_of_group(work_group, local_red_val); + } + else if constexpr (su_ns::IsLogicalOr::value) { + red_val_over_wg = sycl::any_of_group(work_group, local_red_val); + } + else { + red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val, + identity_, reduction_op_); + } + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ + +template +struct CustomReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (std::is_same_v> || + std::is_same_v>) { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * 
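+            // scratch layout of the no-atomic variant: reduction group g of
+            // output element i writes its partial value to
+            // out_[i * n_reduction_groups + g]; a follow-up kernel then
+            // reduces the n_reduction_groups partials of each element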
n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template class kernel_name_token> +sycl::event + sequential_reduction(sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems)); + }); + + return red_ev; +} + +template +class custom_reduction_wrapper; + +template class kernel_name_token> +sycl::event + submit_atomic_reduction(sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + ReductionOverGroupWithAtomicFunctor( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + + using KernelName = class custom_reduction_wrapper< + kernel_name_token>; + + cgh.parallel_for( + ndRange, + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +class reduction_over_group_with_atomics_init_krn; + +template +class reduction_seq_krn; + +template +class reduction_over_group_with_atomics_krn; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event reduction_over_group_with_atomics_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + static constexpr resTy identity_val = + su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_with_atomics_init_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +// Contig + +typedef sycl::event (*reduction_contig_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +/* @brief Reduce rows in a matrix */ +template +sycl::event reduction_axis1_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                RowsIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const RowsIndexerT rows_indexer{/* size */ iter_nelems,
+                                        /* step */ reduction_nelems};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer,
+                                                          result_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
+                : preferred_reductions_per_wi;
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+
+        sycl::event comp_ev =
+            submit_atomic_reduction(
+                exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+                reduction_nelems, reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, {res_init_ev});
+
+        return comp_ev;
+    }
+}
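
[Editor's note, not part of the patch] The axis1 implementation above and the axis0 implementation below differ only in how a flat element index is mapped into the row-major input: axis1 reduces contiguous runs (rows), axis0 reduces columns whose elements are `iter_nelems` apart. A minimal host-side C++ sketch of the index arithmetic that the `Strided1DIndexer`/`NoOpIndexer` pairs encode (names and sizes are illustrative only):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t n_rows = 3, n_cols = 4;
        std::vector<int> m(n_rows * n_cols);
        for (std::size_t i = 0; i < m.size(); ++i)
            m[i] = static_cast<int>(i);

        // axis1 (reduce each row): iteration step == reduction_nelems
        // (= n_cols); the elements of one reduction are contiguous.
        for (std::size_t it = 0; it < n_rows; ++it) {
            int acc = 0;
            for (std::size_t r = 0; r < n_cols; ++r)
                acc += m[it * n_cols + r];
            std::cout << "row " << it << " sum = " << acc << '\n';
        }

        // axis0 (reduce each column): reduction step == iter_nelems
        // (= n_cols); consecutive elements of one reduction are n_cols apart.
        for (std::size_t it = 0; it < n_cols; ++it) {
            int acc = 0;
            for (std::size_t r = 0; r < n_rows; ++r)
                acc += m[it + r * n_cols];
            std::cout << "col " << it << " sum = " << acc << '\n';
        }
        return 0;
    }

The `Strided1DIndexer{/* size */, /* step */}` constructions in the kernels express exactly these `it * step` and `it + r * step` mappings.
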
+
+/* @brief Reduce columns in a matrix */
+template
+sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of reductions (num. of cols in a
+                             // matrix when reducing over cols)
+    std::size_t reduction_nelems, // size of each reduction (length of cols,
+                                  // i.e. number of rows)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = ColsIndexerT;
+
+        static constexpr NoOpIndexerT columns_indexer{};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                          result_indexer};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ?
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */ + +template class kernel_name_token> +sycl::event submit_no_atomic_reduction( + sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + ReductionOverGroupNoAtomicFunctor( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_wrapper< + kernel_name_token>; + + cgh.parallel_for( + ndRange, + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +class reduction_over_group_temps_krn; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +template +class reduction_over_group_temps_empty_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    int iter_nd,
+    const ssize_t *iter_shape_and_strides,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    int red_nd,
+    const ssize_t *reduction_shape_stride,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const ssize_t *const &res_shape = iter_shape_and_strides;
+            const ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                       res_strides);
+            using InitKernelName =
+                class reduction_over_group_temps_empty_krn;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
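
[Editor's note, not part of the patch] The branch that follows sizes a multi-pass tree reduction: each pass launches ceil(n / (reductions_per_wi * wg)) groups per reduction, each group emitting one partial value, and the two halves of the scratch allocation are ping-ponged until one group suffices. A host-side sketch of that sizing arithmetic (the kernels additionally distinguish `wg` from the device limit `max_wg`; this sketch uses a single `wg`, and all values are arbitrary examples):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        const std::size_t wg = 256; // work-group size
        const std::size_t rpw = 8;  // preferred_reductions_per_wi
        std::size_t n = 10'000'000; // reduction_nelems

        // Pass k shrinks n partial values to ceil(n / (rpw * wg)).
        // The temporary holds iter_nelems * (groups_pass0 + groups_pass1)
        // elements, enough to ping-pong between its two halves.
        std::size_t pass = 0;
        while (n > rpw * wg) {
            std::size_t groups = (n + rpw * wg - 1) / (rpw * wg); // ceil div
            std::cout << "pass " << pass++ << ": " << n << " -> " << groups
                      << " partial results\n";
            n = groups;
        }
        std::cout << "final pass reduces " << n
                  << " values in one work-group\n";
        return 0;
    }
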
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+
+            // Only 2*iter_nd entries describing shape and strides of
+            // iterated dimensions of input array from
+            // iter_shape_and_strides are going to be accessed by
+            // inp_indexer
+            const InputIndexerT inp_indexer(iter_nd, iter_arg_offset,
+                                            iter_shape_and_strides);
+            static constexpr ResIndexerT noop_tmp_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                red_nd, reduction_arg_offset, reduction_shape_stride};
+
+            first_reduction_ev = submit_no_atomic_reduction<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
+                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            sycl::event partial_reduction_ev;
+            {
+                using InputIndexerT =
+                    dpctl::tensor::offset_utils::Strided1DIndexer;
+                using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+                using InputOutputIterIndexerT =
+                    dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                        InputIndexerT, ResIndexerT>;
+                using ReductionIndexerT =
+                    dpctl::tensor::offset_utils::NoOpIndexer;
+
+                const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                                /* step */ reduction_groups_};
+                static constexpr ResIndexerT res_iter_indexer{};
+
+                const InputOutputIterIndexerT in_out_iter_indexer{
+                    inp_indexer, res_iter_indexer};
+                static constexpr ReductionIndexerT reduction_indexer{};
+
+                partial_reduction_ev = submit_no_atomic_reduction<
+                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, reduction_over_group_temps_krn>(
+                    exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                    {dependent_ev});
+            }
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        using InputIndexerT =
dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + 2 * iter_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event reduction_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner =
dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const RowsIndexerT rows_indexer{/* size */ iter_nelems, + /* step */ reduction_nelems}; + static constexpr NoOpIndexerT noop_tmp_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + noop_tmp_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + first_reduction_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, partially_reduced_tmp, identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, 
(remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev = submit_no_atomic_reduction<
+            resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
+            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner);
+
+        // FIXME: do not return host-task event
+        // Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template
+sycl::event reduction_axis0_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of reductions (num. of cols in a
+                             // matrix when reducing over cols)
+    std::size_t reduction_nelems, // size of each reduction (length of cols,
+                                  // i.e. number of rows)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = ColsIndexerT;
+
+        static constexpr NoOpIndexerT columns_indexer{};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                          result_indexer};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
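
[Editor's note, not part of the patch] The single-group path that follows relies on an invariant the assert below checks: picking reductions_per_wi = ceil(reduction_nelems / wg) forces reduction_groups to come out as exactly 1. A small host-side verification of that arithmetic (loop bounds and work-group sizes are arbitrary examples; the real kernels clamp wg against the device limit):

    #include <cassert>
    #include <cstddef>

    int main()
    {
        for (std::size_t n = 1; n < 5000; ++n) { // reduction_nelems
            for (std::size_t wg : {std::size_t(8), std::size_t(64),
                                   std::size_t(256)}) {
                std::size_t rpw = (n + wg - 1) / wg; // ceil(n / wg), >= 1
                std::size_t groups = (n + rpw * wg - 1) / (rpw * wg);
                // rpw * wg >= n, so the ceiling division yields exactly 1
                assert(groups == 1);
            }
        }
        return 0;
    }
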
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    NoOpIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = ColsIndexerT;
+
+            static constexpr NoOpIndexerT columns_indexer{};
+            static constexpr NoOpIndexerT noop_tmp_indexer{};
+            const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                              noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                /* size */ reduction_nelems,
+                /* step */ iter_nelems};
+
+            first_reduction_ev = submit_no_atomic_reduction<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
+                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev = submit_no_atomic_reduction<
+                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +// Argmax and Argmin + +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction( + const argT *inp, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val 
== red_val) {
+            idx_val = idx_reduction_op_(idx_val, static_cast<outT>(m));
+        }
+        else {
+            if constexpr (su_ns::IsMinimum::value) {
+                using dpctl::tensor::type_utils::is_complex;
+                if constexpr (is_complex<argT>::value) {
+                    using dpctl::tensor::math_utils::less_complex;
+                    // less_complex always returns false for NaNs, so check
+                    if (less_complex(val, red_val) ||
+                        std::isnan(std::real(val)) ||
+                        std::isnan(std::imag(val))) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+                else if constexpr (std::is_floating_point_v<argT> ||
+                                   std::is_same_v<argT, sycl::half>) {
+                    if (val < red_val || std::isnan(val)) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+                else {
+                    if (val < red_val) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+            }
+            else if constexpr (su_ns::IsMaximum::value) {
+                using dpctl::tensor::type_utils::is_complex;
+                if constexpr (is_complex<argT>::value) {
+                    using dpctl::tensor::math_utils::greater_complex;
+                    if (greater_complex(val, red_val) ||
+                        std::isnan(std::real(val)) ||
+                        std::isnan(std::imag(val))) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+                else if constexpr (std::is_floating_point_v<argT> ||
+                                   std::is_same_v<argT, sycl::half>) {
+                    if (val > red_val || std::isnan(val)) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+                else {
+                    if (val > red_val) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+            }
+        }
+    }
+    out_[out_iter_offset] = idx_val;
+    }
+};
+
+/* = Search reduction using reduce_over_group = */
+
+template
+struct SearchReduction
+{
+private:
+    const argT *inp_ = nullptr;
+    argT *vals_ = nullptr;
+    const outT *inds_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    argT identity_;
+    IdxReductionOp idx_reduction_op_;
+    outT idx_identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t iter_gws_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    SearchReduction(const argT *data,
+                    argT *vals,
+                    const outT *inds,
+                    outT *res,
+                    const ReductionOp &reduction_op,
+                    const argT &identity_val,
+                    const IdxReductionOp &idx_reduction_op,
+                    const outT &idx_identity_val,
+                    const InputOutputIterIndexerT &arg_res_iter_indexer,
+                    const InputRedIndexerT &arg_reduced_dims_indexer,
+                    std::size_t reduction_size,
+                    std::size_t iteration_size,
+                    std::size_t reduction_size_per_wi)
+        : inp_(data), vals_(vals), inds_(inds), out_(res),
+          reduction_op_(reduction_op), identity_(identity_val),
+          idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val),
+          inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
+          reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); // 0 <= reduction_lid < wg
+
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+        const std::size_t n_reduction_groups =
+            it.get_group_range(0) / iter_gws_;
+
+        // work-items operate over input with indices
+        // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //               + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        argT local_red_val(identity_);
+        outT
local_idx(idx_identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + 
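
[Editor's note, not part of the patch] In operator() below, as in SearchReduction above, the flattened one-dimensional nd-range packs two coordinates into the group index: group g serves reduction g % iter_gws_ and reduction batch g / iter_gws_, and each work-item then strides through its batch wg elements at a time. A host-side sketch of that decomposition with toy sizes (the real kernels additionally clamp each gid against reduction_max_gid_):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        const std::size_t iter_gws = 3;  // number of independent reductions
        const std::size_t n_batches = 2; // reduction groups per reduction
        const std::size_t wg = 4;        // work-group size
        const std::size_t rpw = 2;       // reductions_per_wi

        for (std::size_t g = 0; g < iter_gws * n_batches; ++g) {
            const std::size_t iter_gid = g % iter_gws;
            const std::size_t batch_id = g / iter_gws;
            // work-item lid in this group reads gids
            // lid + batch_id * wg * rpw + m * wg, for 0 <= m < rpw
            const std::size_t base = batch_id * wg * rpw;
            std::cout << "group " << g << ": reduction " << iter_gid
                      << ", batch " << batch_id << ", covers gids [" << base
                      << ", " << base + wg * rpw - 1 << "]\n";
        }
        return 0;
    }
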
void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); // 0 <= reduction_lid < wg
+
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+        const std::size_t n_reduction_groups =
+            it.get_group_range(0) / iter_gws_;
+
+        // work-items operate over input with indices
+        // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //               + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        argT local_red_val(identity_);
+        outT local_idx(idx_identity_);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
+            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
+
+            if (arg_reduce_gid < reduction_max_gid_) {
+                auto inp_reduction_offset =
+                    inp_reduced_dims_indexer_(arg_reduce_gid);
+                auto inp_offset = inp_iter_offset + inp_reduction_offset;
+
+                argT val = inp_[inp_offset];
+                if (val == local_red_val) {
+                    if constexpr (!First) {
+                        local_idx =
+                            idx_reduction_op_(local_idx, inds_[inp_offset]);
+                    }
+                    else {
+                        local_idx = idx_reduction_op_(
+                            local_idx, static_cast<outT>(arg_reduce_gid));
+                    }
+                }
+                else {
+                    if constexpr (su_ns::IsMinimum::value) {
+                        using dpctl::tensor::type_utils::is_complex;
+                        if constexpr (is_complex<argT>::value) {
+                            using dpctl::tensor::math_utils::less_complex;
+                            // less_complex always returns false for NaNs, so
+                            // check
+                            if (less_complex(val, local_red_val) ||
+                                std::isnan(std::real(val)) ||
+                                std::isnan(std::imag(val))) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else if constexpr (std::is_floating_point_v<argT> ||
+                                           std::is_same_v<argT, sycl::half>) {
+                            if (val < local_red_val || std::isnan(val)) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else {
+                            if (val < local_red_val) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                    }
+                    else if constexpr (su_ns::IsMaximum::value) {
+                        using dpctl::tensor::type_utils::is_complex;
+                        if constexpr (is_complex<argT>::value) {
+                            using dpctl::tensor::math_utils::greater_complex;
+                            if (greater_complex(val, local_red_val) ||
+                                std::isnan(std::real(val)) ||
+                                std::isnan(std::imag(val))) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else if constexpr (std::is_floating_point_v<argT> ||
+                                           std::is_same_v<argT, sycl::half>) {
+                            if (val > local_red_val || std::isnan(val)) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else {
+                            if (val > local_red_val) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        auto work_group = it.get_group();
+        // This only works if reduction_op_ is from small set of operators
+        argT red_val_over_wg = su_ns::custom_reduce_over_group(
+ work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +template +class search_seq_strided_krn; + +template +class search_seq_contig_krn; + +template +class search_over_group_krn; + +template +class custom_search_over_group_krn; + +template +class search_empty_krn; + +template +sycl::event + submit_search_reduction(sycl::queue &exec_q, + const argTy *arg, + argTy *arg_tmp, + resTy *res_tmp, + resTy *res, + argTy identity_val, + resTy idx_identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class search_over_group_krn; + cgh.parallel_for( + ndRange, SearchReduction( + arg, arg_tmp, res_tmp, res, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_search_over_group_krn< + argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT, First, Last>; + cgh.parallel_for( + ndRange, + CustomSearchReduction( + arg, arg_tmp, res_tmp, res, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +sycl::event search_over_group_temps_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    int iter_nd,
+    const ssize_t *iter_shape_and_strides,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    int red_nd,
+    const ssize_t *reduction_shape_stride,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    static constexpr argTy identity_val =
+        su_ns::Identity<ReductionOpT, argTy>::value;
+    static constexpr resTy idx_identity_val =
+        su_ns::Identity<IndexOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const ssize_t *const &res_shape = iter_shape_and_strides;
+            const ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                       res_strides);
+            using InitKernelName =
+                class search_empty_krn;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = idx_identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.parallel_for>(
+                sycl::range<1>(iter_nelems),
+                SequentialSearchReduction(
+                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
+                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
+                    reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 4;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev =
+            submit_search_reduction(
+                exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a
temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + static constexpr ResIndexerT noop_tmp_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + const ReductionIndexerT reduction_indexer{ + red_nd, reduction_arg_offset, reduction_shape_stride}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, 
preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + 2 * iter_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +typedef sycl::event (*search_contig_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event search_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + 
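+        // Ceiling division idiom: ceil(a / b) is computed in integer
+        // arithmetic as (a + b - 1) / b. With hypothetical sizes
+        // reduction_groups = 4096, preferred_reductions_per_wi = 8 and
+        // wg = 256, the second pass would need ceil(4096 / 2048) = 2 groups;
+        // sizing the temporary for both passes up front avoids a second
+        // allocation.
+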
(preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ remaining_reduction_nelems};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev =
+            submit_search_reduction(
+                exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, remaining_reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner);
+
+        // FIXME: do not return host-task event
+        // Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename IndexOpT>
+sycl::event search_axis0_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr argTy identity_val =
+        su_ns::Identity<ReductionOpT, argTy>::value;
+    static constexpr resTy idx_identity_val =
+        su_ns::Identity<IndexOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            using KernelName =
+                class search_seq_contig_krn<argTy, resTy, ReductionOpT,
+                                            IndexOpT, InputOutputIterIndexerT,
+                                            ReductionIndexerT>;
+
+            sycl::range<1> iter_range{iter_nelems};
+
+            cgh.parallel_for<KernelName>(
+                iter_range,
+                SequentialSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                          InputOutputIterIndexerT,
+                                          ReductionIndexerT>(
+                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
+                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
+                    reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
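+        // Illustrative sizing with hypothetical numbers: for
+        // reduction_nelems = 4096, wg = 256 and max_wg = 1024, this branch is
+        // taken because 4096 <= 8 * 1024; then
+        //   reductions_per_wi = max(1, ceil(4096 / 256)) = 16
+        //   reduction_groups  = ceil(4096 / (16 * 256)) = 1
+        // i.e. a single work-group spans the whole reduction and the result
+        // can be written to res without a temporary.
+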
using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto vals_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = vals_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{ + /* size */ reduction_nelems, + /* step */ iter_nelems}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + 
(remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, vals_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/repeat.hpp b/dpnp/tensor/libtensor/include/kernels/repeat.hpp new file mode 100644 index 000000000000..83a520adb538 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/repeat.hpp @@ -0,0 +1,460 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor repeating operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" + +namespace dpctl::tensor::kernels::repeat +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +template +class repeat_by_sequence_kernel; + +template +class RepeatSequenceFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + const repT *reps = nullptr; + const repT *cumsum = nullptr; + std::size_t src_axis_nelems = 1; + OrthogIndexer orthog_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; + RepIndexer reps_strider; + +public: + RepeatSequenceFunctor(const T *src_, + T *dst_, + const repT *reps_, + const repT *cumsum_, + std::size_t src_axis_nelems_, + const OrthogIndexer &orthog_strider_, + const SrcAxisIndexer &src_axis_strider_, + const DstAxisIndexer &dst_axis_strider_, + const RepIndexer &reps_strider_) + : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), + src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), + src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_), reps_strider(reps_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + std::size_t id = idx[0]; + auto i_orthog = id / src_axis_nelems; + auto i_along = id - (i_orthog * src_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto val = src[src_offset + src_axis_strider(i_along)]; + auto last = cumsum[i_along]; + auto first = last - reps[reps_strider(i_along)]; + for (auto i = first; i < last; ++i) { + dst[dst_offset + dst_axis_strider(i)] = val; + } + } +}; + +typedef sycl::event (*repeat_by_sequence_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + const char *, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + 
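+    // The unnamed parameters correspond, in order, to those of
+    // repeat_by_sequence_impl below: (queue, orthog_nelems, src_axis_nelems,
+    // src, dst, reps, cumsum, orthog_nd, orthog_src_dst_shape_and_strides,
+    // src_offset, dst_offset, src_axis_shape, src_axis_stride,
+    // dst_axis_shape, dst_axis_stride, reps_shape, reps_stride, depends).
+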
ssize_t, + const std::vector &); + +template +sycl::event + repeat_by_sequence_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t src_axis_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int orthog_nd, + const ssize_t *orthog_src_dst_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + ssize_t reps_shape, + ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + const TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, + orthog_src_dst_shape_and_strides}; + // indexers along repeated axis + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ dst_axis_stride}; + // indexer along reps array + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; + + const std::size_t gws = orthog_nelems * src_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, + orthog_indexer, src_axis_indexer, dst_axis_indexer, + reps_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatSequenceFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const char *, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, + std::size_t src_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int src_nd, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + ssize_t reps_shape, + ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + const StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; + const Strided1DIndexer dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; + // indexer along reps array + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; + + const std::size_t gws = src_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, + src_indexer, dst_indexer, reps_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatSequence1DFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_1d_impl; + return fn; + } +}; + +template +class repeat_by_scalar_kernel; + +template +class RepeatScalarFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + ssize_t reps = 1; + std::size_t 
dst_axis_nelems = 0; + OrthogIndexer orthog_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; + +public: + RepeatScalarFunctor(const T *src_, + T *dst_, + const ssize_t reps_, + std::size_t dst_axis_nelems_, + const OrthogIndexer &orthog_strider_, + const SrcAxisIndexer &src_axis_strider_, + const DstAxisIndexer &dst_axis_strider_) + : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), + orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + std::size_t id = idx[0]; + auto i_orthog = id / dst_axis_nelems; + auto i_along = id - (i_orthog * dst_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto dst_axis_offset = dst_axis_strider(i_along); + auto src_axis_offset = src_axis_strider(i_along / reps); + dst[dst_offset + dst_axis_offset] = src[src_offset + src_axis_offset]; + } +}; + +typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + const ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t dst_axis_nelems, + const char *src_cp, + char *dst_cp, + const ssize_t reps, + int orthog_nd, + const ssize_t *orthog_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + const TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, orthog_shape_and_strides}; + // indexers along repeated axis + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ dst_axis_stride}; + + const std::size_t gws = orthog_nelems * dst_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor( + src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, + src_axis_indexer, dst_axis_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatScalarFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, + std::size_t dst_nelems, + const char *src_cp, + char *dst_cp, + const ssize_t reps, + int src_nd, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + const StridedIndexer src_indexer(src_nd, 0, src_shape_strides); + const Strided1DIndexer 
dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; + + const std::size_t gws = dst_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor(src_tp, dst_tp, reps, + dst_nelems, orthog_indexer, + src_indexer, dst_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatScalar1DFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_1d_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::repeat diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/isin.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/isin.hpp new file mode 100644 index 000000000000..847fa96ecdff --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/isin.hpp @@ -0,0 +1,245 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor membership operations. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "utils/offset_utils.hpp" +#include "utils/rich_comparisons.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +template +struct IsinFunctor +{ +private: + bool invert; + const T *hay_tp; + const T *needles_tp; + bool *out_tp; + std::size_t hay_nelems; + HayIndexerT hay_indexer; + NeedlesIndexerT needles_indexer; + OutIndexerT out_indexer; + +public: + IsinFunctor(const bool invert_, + const T *hay_, + const T *needles_, + bool *out_, + const std::size_t hay_nelems_, + const HayIndexerT &hay_indexer_, + const NeedlesIndexerT &needles_indexer_, + const OutIndexerT &out_indexer_) + : invert(invert_), hay_tp(hay_), needles_tp(needles_), out_tp(out_), + hay_nelems(hay_nelems_), hay_indexer(hay_indexer_), + needles_indexer(needles_indexer_), out_indexer(out_indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + using Compare = + typename dpctl::tensor::rich_comparisons::AscendingSorter::type; + static constexpr Compare comp{}; + + const std::size_t i = id[0]; + const T needle_v = needles_tp[needles_indexer(i)]; + + // position of the needle_v in the hay array + std::size_t pos{}; + + static constexpr std::size_t zero(0); + // search in hay in left-closed interval, give `pos` such that + // hay[pos - 1] < needle_v <= hay[pos] + + // lower_bound returns the first pos such that bool(hay[pos] < + // needle_v) is false, i.e. needle_v <= hay[pos] + pos = search_sorted_detail::lower_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + bool out = (pos == hay_nelems ? false : hay_tp[pos] == needle_v); + out_tp[out_indexer(i)] = (invert) ? 
!out : out; + } +}; + +typedef sycl::event (*isin_contig_impl_fp_ptr_t)( + sycl::queue &, + const bool, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + const std::vector &); + +template +class isin_contig_impl_krn; + +template +sycl::event isin_contig_impl(sycl::queue &exec_q, + const bool invert, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + const char *needles_cp, + const ssize_t needles_offset, + char *out_cp, + const ssize_t out_offset, + const std::vector &depends) +{ + const T *hay_tp = reinterpret_cast(hay_cp) + hay_offset; + const T *needles_tp = + reinterpret_cast(needles_cp) + needles_offset; + + bool *out_tp = reinterpret_cast(out_cp) + out_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = class isin_contig_impl_krn; + + sycl::range<1> gRange(needles_nelems); + + using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + static constexpr TrivialIndexerT hay_indexer{}; + static constexpr TrivialIndexerT needles_indexer{}; + static constexpr TrivialIndexerT out_indexer{}; + + const auto fnctr = + IsinFunctor( + invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer, + needles_indexer, out_indexer); + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +typedef sycl::event (*isin_strided_impl_fp_ptr_t)( + sycl::queue &, + const bool, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + int, + const ssize_t *, + const std::vector &); + +template +class isin_strided_impl_krn; + +template +sycl::event isin_strided_impl( + sycl::queue &exec_q, + const bool invert, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array + const ssize_t hay_stride, + const char *needles_cp, + const ssize_t needles_offset, + char *out_cp, + const ssize_t out_offset, + const int needles_nd, + // packed_shape_strides is [needles_shape, needles_strides, + // out_strides] has length of 3*needles_nd + const ssize_t *packed_shape_strides, + const std::vector &depends) +{ + const T *hay_tp = reinterpret_cast(hay_cp); + const T *needles_tp = reinterpret_cast(needles_cp); + + bool *out_tp = reinterpret_cast(out_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange(needles_nelems); + + using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + const HayIndexerT hay_indexer( + /* offset */ hay_offset, + /* size */ hay_nelems, + /* step */ hay_stride); + + using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const ssize_t *needles_shape_strides = packed_shape_strides; + const NeedlesIndexerT needles_indexer(needles_nd, needles_offset, + needles_shape_strides); + using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *out_shape = packed_shape_strides; + const ssize_t *out_strides = packed_shape_strides + 2 * needles_nd; + const OutIndexerT out_indexer(needles_nd, out_offset, out_shape, + out_strides); + + const auto fnctr = + IsinFunctor( + invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer, + needles_indexer, out_indexer); + using KernelName = class isin_strided_impl_krn; + + 
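+        // Illustrative trace (hypothetical data): for sorted hay = {1, 3, 5,
+        // 7} and needle_v = 5, lower_bound gives pos = 2 (hay[1] = 3 < 5,
+        // hay[2] = 5 is not < 5), and hay[2] == 5, so out = true; for
+        // needle_v = 8, pos = 4 == hay_nelems, so out = false. With
+        // invert == true each result is negated, matching the
+        // inverted-membership (invert=True) behaviour.
+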
cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp new file mode 100644 index 000000000000..75d3dc5f01a0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -0,0 +1,844 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace merge_sort_detail +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::kernels::search_sorted_detail; + +/*! 
@brief Merge two contiguous sorted segments */ +template +void merge_impl(const std::size_t offset, + const InAcc in_acc, + OutAcc out_acc, + const std::size_t start_1, + const std::size_t end_1, + const std::size_t end_2, + const std::size_t start_out, + Compare comp, + const std::size_t chunk) +{ + const std::size_t start_2 = end_1; + // Borders of the sequences to merge within this call + const std::size_t local_start_1 = sycl::min(offset + start_1, end_1); + const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1); + const std::size_t local_start_2 = sycl::min(offset + start_2, end_2); + const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2); + + const std::size_t local_size_1 = local_end_1 - local_start_1; + const std::size_t local_size_2 = local_end_2 - local_start_2; + + const auto r_item_1 = in_acc[end_1 - 1]; + const auto l_item_2 = (start_2 < end_2) ? in_acc[start_2] : r_item_1; + + // Copy if the sequences are sorted with respect to each other or merge + // otherwise + if (!comp(l_item_2, r_item_1)) { + const std::size_t out_shift_1 = start_out + local_start_1 - start_1; + const std::size_t out_shift_2 = + start_out + end_1 - start_1 + local_start_2 - start_2; + + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + else if (comp(r_item_1, l_item_2)) { + const std::size_t out_shift_1 = + start_out + end_2 - start_2 + local_start_1 - start_1; + const std::size_t out_shift_2 = start_out + local_start_2 - start_2; + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + // Perform merging + else { + + // Process 1st sequence + if (local_start_1 < local_end_1) { + // Reduce the range for searching within the 2nd sequence and handle + // bound items find left border in 2nd sequence + const auto local_l_item_1 = in_acc[local_start_1]; + std::size_t l_search_bound_2 = + lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp); + const std::size_t l_shift_1 = local_start_1 - start_1; + const std::size_t l_shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1; + + std::size_t r_search_bound_2{}; + // find right border in 2nd sequence + if (local_size_1 > 1) { + const auto local_r_item_1 = in_acc[local_end_1 - 1]; + r_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, end_2, local_r_item_1, comp); + const auto r_shift_1 = local_end_1 - 1 - start_1; + const auto r_shift_2 = r_search_bound_2 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1; + } + + // Handle intermediate items + if (r_search_bound_2 == l_search_bound_2) { + const std::size_t shift_2 = l_search_bound_2 - start_2; + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + const std::size_t shift_1 = idx - start_1; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + else { + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + // we shouldn't seek in whole 2nd sequence. 
Just for the + // part where the 1st sequence should be + l_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, r_search_bound_2, + intermediate_item_1, comp); + const std::size_t shift_1 = idx - start_1; + const std::size_t shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + } + // Process 2nd sequence + if (local_start_2 < local_end_2) { + // Reduce the range for searching within the 1st sequence and handle + // bound items find left border in 1st sequence + const auto local_l_item_2 = in_acc[local_start_2]; + std::size_t l_search_bound_1 = + upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp); + const std::size_t l_shift_1 = l_search_bound_1 - start_1; + const std::size_t l_shift_2 = local_start_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2; + + std::size_t r_search_bound_1{}; + // find right border in 1st sequence + if (local_size_2 > 1) { + const auto local_r_item_2 = in_acc[local_end_2 - 1]; + r_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, end_1, local_r_item_2, comp); + const std::size_t r_shift_1 = r_search_bound_1 - start_1; + const std::size_t r_shift_2 = local_end_2 - 1 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2; + } + + // Handle intermediate items + if (l_search_bound_1 == r_search_bound_1) { + const std::size_t shift_1 = l_search_bound_1 - start_1; + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; + ++idx) { + const auto intermediate_item_2 = in_acc[idx]; + const std::size_t shift_2 = idx - start_2; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + else { + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; + ++idx) { + const auto intermediate_item_2 = in_acc[idx]; + // we shouldn't seek in whole 1st sequence. 
Just for the + // part where the 2nd sequence should be + l_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, r_search_bound_1, + intermediate_item_2, comp); + const std::size_t shift_1 = l_search_bound_1 - start_1; + const std::size_t shift_2 = idx - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + } + } +} + +template +void insertion_sort_impl(Iter &&first, + std::size_t begin, + std::size_t end, + Compare &&comp) +{ + for (std::size_t i = begin + 1; i < end; ++i) { + const auto val_i = first[i]; + std::size_t j = i - 1; + while ((j + 1 > begin) && (comp(val_i, first[j]))) { + first[j + 1] = first[j]; + --j; + } + if (j + 1 < i) { + first[j + 1] = val_i; + } + } +} + +template +void leaf_sort_impl(Iter &&first, + std::size_t begin, + std::size_t end, + Compare &&comp) +{ + return insertion_sort_impl(std::forward(first), + std::move(begin), std::move(end), + std::forward(comp)); +} + +template +struct GetValueType +{ + using value_type = typename std::iterator_traits::value_type; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetValueType< + sycl::accessor> +{ + using value_type = ElementType; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetReadOnlyAccess +{ + Iter operator()(const Iter &it, sycl::handler &) { return it; } +}; + +template +struct GetReadOnlyAccess> +{ + auto operator()(const sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_only); + return acc; + } +}; + +template +struct GetWriteDiscardAccess +{ + Iter operator()(Iter it, sycl::handler &) { return it; } +}; + +template +struct GetWriteDiscardAccess> +{ + auto operator()(sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init); + return acc; + } +}; + +template +struct GetReadWriteAccess +{ + Iter operator()(Iter &it, sycl::handler &) { return it; } +}; + +template +struct GetReadWriteAccess> +{ + auto operator()(sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_write); + return acc; + } +}; + +template +class sort_base_step_contig_krn; + +template +sycl::event + sort_base_step_contig_impl(sycl::queue &q, + const std::size_t iter_nelems, + const std::size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + const std::size_t conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + const std::size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const sycl::range<1> gRange{iter_nelems * n_segments}; + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const std::size_t iter_id = id[0] / n_segments; + const std::size_t segment_id = id[0] - iter_id * n_segments; + + const std::size_t iter_offset = iter_id * sort_nelems; + const std::size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + const std::size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, sort_nelems); + for (std::size_t i = beg_id; i < end_id; ++i) { + output_acc[i] = input_acc[i]; + } + + leaf_sort_impl(output_acc, beg_id, end_id, comp); + }); 
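+        // Illustrative decomposition (hypothetical numbers): with
+        // sort_nelems = 100 and conseq_nelems_sorted = 16, n_segments =
+        // ceil(100 / 16) = 7; the work-item for (iter_id, segment_id) copies
+        // its slice [beg_id, end_id) of the row into output and
+        // insertion-sorts it in place, leaving each row as a run of sorted
+        // blocks of length conseq_nelems_sorted (the last one possibly
+        // shorter).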
+ }); + + return base_sort; +} + +template +class sort_over_work_group_contig_krn; + +template +sycl::event sort_over_work_group_contig_impl( + sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + std::size_t &nelems_wg_sorts, + const std::vector &depends = {}) +{ + using inpT = typename GetValueType::value_type; + using T = typename GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(T)); + + static constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + const std::size_t lws = sub_groups_per_work_group * max_sg_size; + + nelems_wg_sorts = elems_per_wi * lws; + + if (nelems_wg_sorts > nelems_per_slm) { + nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + + return sort_base_step_contig_impl( + q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, + depends); + } + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + const std::size_t n_segments = quotient_ceil(sort_nelems, nelems_wg_sorts); + + sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{nelems_wg_sorts}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = segment_id * nelems_wg_sorts; + const std::size_t segment_end_idx = + std::min(segment_start_idx + nelems_wg_sorts, sort_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) { + T v = (array_id < sort_nelems) + ? 
input_acc[iter_id * sort_nelems + array_id] + : T{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = + sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = + sycl::min(start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = + sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_impl(offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + else { + merge_impl(offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + const auto &out_src = (data_in_temp) ? scratch_space : work_space; + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) { + if (array_id < sort_nelems) { + output_acc[iter_id * sort_nelems + array_id] = + out_src[array_id - segment_start_idx]; + } + } + }); + }); + + return base_sort_ev; +} + +class vacuous_krn; + +inline sycl::event tie_events(sycl::queue &q, + const std::vector depends) +{ + if (depends.empty()) + return sycl::event(); + if (depends.size() == 1) + return depends[0]; + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + using KernelName = vacuous_krn; + cgh.single_task([]() {}); + }); + + return e; +} + +template +class merge_adjacent_blocks_to_temp_krn; + +template +class merge_adjacent_blocks_from_temp_krn; + +template +sycl::event + merge_sorted_block_contig_impl(sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + Acc output, + const Comp comp, + std::size_t sorted_block_size, + const std::vector &depends = {}) +{ + + if (sorted_block_size >= sort_nelems) + return tie_events(q, depends); + + // experimentally determined value + // size of segments worked upon by each work-item during merging + const sycl::device &dev = q.get_device(); + const std::size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + + const std::size_t chunk_size = + (sorted_block_size < segment_size) ? 
sorted_block_size : segment_size; + + assert(sorted_block_size % chunk_size == 0); + + using T = typename GetValueType::value_type; + + sycl::buffer temp_buf(sycl::range<1>{iter_nelems * sort_nelems}); + // T *allocated_mem = sycl::malloc_device(iter_nelems * sort_nelems, q); + + bool needs_copy = true; + bool used_depends = false; + + sycl::event dep_ev; + std::size_t chunks_merged = sorted_block_size / chunk_size; + + assert(!(chunks_merged & (chunks_merged - 1))); + + using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn; + using FromTempKernelName = + class merge_adjacent_blocks_from_temp_krn; + + while (chunks_merged * chunk_size < sort_nelems) { + sycl::event local_dep = dep_ev; + + sycl::event merge_ev = q.submit([&](sycl::handler &cgh) { + if (used_depends) { + cgh.depends_on(local_dep); + } + else { + cgh.depends_on(depends); + used_depends = true; + } + + const std::size_t n_chunks = quotient_ceil(sort_nelems, chunk_size); + + if (needs_copy) { + sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, + sycl::no_init}; + auto output_acc = GetReadOnlyAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, output_acc, temp_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + else { + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, temp_acc, output_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + }); + + chunks_merged *= 2; + dep_ev = merge_ev; + + if (chunks_merged * chunk_size < sort_nelems) { + needs_copy = !needs_copy; + } + } + + if (needs_copy) { + sycl::event copy_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_ev); + + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.copy(temp_acc, output_acc); + }); + dep_ev = copy_ev; + } + + return dep_ev; +} + +} // namespace merge_sort_detail + +template > +sycl::event stable_sort_axis1_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of 
sub-arrays to sort (num. of rows in a
+                             // matrix when sorting over rows)
+    std::size_t sort_nelems, // size of each array to sort (length of rows,
+                             // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t sort_arg_offset,
+    ssize_t sort_res_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + sort_arg_offset;
+    argTy *res_tp =
+        reinterpret_cast<argTy *>(res_cp) + iter_res_offset + sort_res_offset;
+
+    auto comp = Comp{};
+
+    // constant chosen experimentally to ensure monotonicity of
+    // sorting performance, as measured on GPU Max and Iris Xe
+    constexpr std::size_t sequential_sorting_threshold = 16;
+
+    if (sort_nelems < sequential_sorting_threshold) {
+        // each work-item sorts an entire row
+        sycl::event sequential_sorting_ev =
+            merge_sort_detail::sort_base_step_contig_impl(
+                exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
+                sort_nelems, depends);
+
+        return sequential_sorting_ev;
+    }
+    else {
+        std::size_t sorted_block_size{};
+
+        // Sort segments of the array
+        sycl::event base_sort_ev =
+            merge_sort_detail::sort_over_work_group_contig_impl(
+                exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
+                sorted_block_size, // modified in place with the size of the
+                                   // sorted block
+                depends);
+
+        // Merge segments in parallel until all elements are sorted
+        sycl::event merges_ev =
+            merge_sort_detail::merge_sorted_block_contig_impl(
+                exec_q, iter_nelems, sort_nelems, res_tp, comp,
+                sorted_block_size, {base_sort_ev});
+
+        return merges_ev;
+    }
+}
+
+template <typename T1, typename T2, typename T3>
+class populate_index_data_krn;
+
+template <typename T1, typename T2, typename T3>
+class index_map_to_rows_krn;
+
+template <typename IndexT, typename ValueT, typename ValueComp>
+struct IndexComp
+{
+    IndexComp(const ValueT *data, const ValueComp &comp_op)
+        : ptr(data), value_comp(comp_op)
+    {
+    }
+
+    bool operator()(const IndexT &i1, const IndexT &i2) const
+    {
+        return value_comp(ptr[i1], ptr[i2]);
+    }
+
+private:
+    const ValueT *ptr;
+    ValueComp value_comp;
+};
+
+template <typename argTy,
+          typename IndexTy,
+          typename ValueComp = std::less<argTy>>
+sycl::event stable_argsort_axis1_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a
+                             // matrix when sorting over rows)
+    std::size_t sort_nelems, // size of each array to sort (length of rows,
+                             // i.e.
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + const IndexComp index_comp{arg_tp, ValueComp{}}; + + static constexpr std::size_t determine_automatically = 0; + std::size_t sorted_block_size = determine_automatically; + + const std::size_t total_nelems = iter_nelems * sort_nelems; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + using IotaKernelName = populate_index_data_krn; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, res_tp, total_nelems, depends); + + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, res_tp, index_comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size, + {base_sort_ev}); + + // no need to map back if iter_nelems == 1 + if (iter_nelems == 1u) { + return merges_ev; + } + + using MapBackKernelName = index_map_to_rows_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; + + sycl::event write_out_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev}); + + return write_out_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp new file mode 100644 index 000000000000..5baa98e237df --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -0,0 +1,1905 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/sort_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace radix_sort_details +{ + +template +class radix_sort_count_kernel; + +template +class radix_sort_scan_kernel; + +template +class radix_sort_reorder_peer_kernel; + +template +class radix_sort_reorder_kernel; + +/*! @brief Computes smallest exponent such that `n <= (1 << exponent)` */ +template && + sizeof(SizeT) == sizeof(std::uint64_t), + int> = 0> +std::uint32_t ceil_log2(SizeT n) +{ + // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b + // floor_log2(q * 2^b + r) == floor_log2(q * 2^b) == q + floor_log2(n1) + // ceil_log2(n) == 1 + floor_log2(n-1) + if (n <= 1) + return std::uint32_t{1}; + + std::uint32_t exp{1}; + --n; + if (n >= (SizeT{1} << 32)) { + n >>= 32; + exp += 32; + } + if (n >= (SizeT{1} << 16)) { + n >>= 16; + exp += 16; + } + if (n >= (SizeT{1} << 8)) { + n >>= 8; + exp += 8; + } + if (n >= (SizeT{1} << 4)) { + n >>= 4; + exp += 4; + } + if (n >= (SizeT{1} << 2)) { + n >>= 2; + exp += 2; + } + if (n >= (SizeT{1} << 1)) { + n >>= 1; + ++exp; + } + return exp; +} + +//---------------------------------------------------------- +// bitwise order-preserving conversions to unsigned integers +//---------------------------------------------------------- + +template +bool order_preserving_cast(bool val) +{ + if constexpr (is_ascending) + return val; + else + return !val; +} + +template , int> = 0> +UIntT order_preserving_cast(UIntT val) +{ + if constexpr (is_ascending) { + return val; + } + else { + // bitwise invert + return (~val); + } +} + +template && std::is_signed_v, + int> = 0> +std::make_unsigned_t order_preserving_cast(IntT val) +{ + using UIntT = std::make_unsigned_t; + const UIntT uint_val = sycl::bit_cast(val); + + if constexpr (is_ascending) { + // ascending_mask: 100..0 + static constexpr UIntT ascending_mask = + (UIntT(1) << std::numeric_limits::digits); + return (uint_val ^ ascending_mask); + } + else { + // descending_mask: 011..1 + static constexpr UIntT descending_mask = + (std::numeric_limits::max() >> 1); + return (uint_val ^ descending_mask); + } +} + +template +std::uint16_t order_preserving_cast(sycl::half val) +{ + using UIntT = std::uint16_t; + + const UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? 
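+    // Worked example of the order-preserving transform (ascending case,
+    // illustrative fp16 bit patterns, not taken from the code): non-negative
+    // values get only the sign bit flipped while negative values get all
+    // bits flipped, so unsigned comparison of the keys matches
+    // floating-point order:
+    //   -2.0 (0xC000) -> 0x3FFF,  -0.0 (0x8000) -> 0x7FFF,
+    //   +0.0 (0x0000) -> 0x8000,  +2.0 (0x4000) -> 0xC000.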
std::numeric_limits::quiet_NaN() + : val); + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 15)); + + static constexpr UIntT zero_mask = UIntT(0x8000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFu); + + static constexpr UIntT inv_zero_mask = static_cast(~zero_mask); + static constexpr UIntT inv_nonzero_mask = static_cast(~nonzero_mask); + + if constexpr (is_ascending) { + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + } + else { + mask = (zero_fp_sign_bit) ? (inv_zero_mask) : (inv_nonzero_mask); + } + + return (uint_val ^ mask); +} + +template && + sizeof(FloatT) == sizeof(std::uint32_t), + int> = 0> +std::uint32_t order_preserving_cast(FloatT val) +{ + using UIntT = std::uint32_t; + + UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? std::numeric_limits::quiet_NaN() : val); + + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 31)); + + static constexpr UIntT zero_mask = UIntT(0x80000000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu); + + if constexpr (is_ascending) + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + else + mask = (zero_fp_sign_bit) ? (~zero_mask) : (~nonzero_mask); + + return (uint_val ^ mask); +} + +template && + sizeof(FloatT) == sizeof(std::uint64_t), + int> = 0> +std::uint64_t order_preserving_cast(FloatT val) +{ + using UIntT = std::uint64_t; + + UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? std::numeric_limits::quiet_NaN() : val); + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 63)); + + static constexpr UIntT zero_mask = UIntT(0x8000000000000000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu); + + if constexpr (is_ascending) + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + else + mask = (zero_fp_sign_bit) ? 
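+    // NB: as in the 16- and 32-bit variants above, every NaN payload is
+    // first canonicalized to quiet_NaN, so all NaNs map to the same key and
+    // sort as one contiguous block (at the end in ascending order).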
(~zero_mask) : (~nonzero_mask); + + return (uint_val ^ mask); +} + +//----------------- +// bucket functions +//----------------- + +template +constexpr std::size_t number_of_bits_in_type() +{ + constexpr std::size_t type_bits = + (sizeof(T) * std::numeric_limits::digits); + return type_bits; +} + +// the number of buckets (size of radix bits) in T +template +constexpr std::uint32_t number_of_buckets_in_type(std::uint32_t radix_bits) +{ + constexpr std::size_t type_bits = number_of_bits_in_type(); + return (type_bits + radix_bits - 1) / radix_bits; +} + +// get bits value (bucket) in a certain radix position +template +std::uint32_t get_bucket_id(T val, std::uint32_t radix_offset) +{ + static_assert(std::is_unsigned_v); + + return (val >> radix_offset) & T(radix_mask); +} + +//-------------------------------- +// count kernel (single iteration) +//-------------------------------- + +template +sycl::event + radix_sort_count_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::size_t wg_size, + std::uint32_t radix_offset, + std::size_t n_values, + ValueT *vals_ptr, + std::size_t n_counts, + CountT *counts_ptr, + const Proj &proj_op, + const bool is_ascending, + const std::vector &dependency_events) +{ + // bin_count = radix_states used for an array storing bucket state counters + static constexpr std::uint32_t radix_states = + (std::uint32_t(1) << radix_bits); + static constexpr std::uint32_t radix_mask = radix_states - 1; + + // iteration space info + const std::size_t n = n_values; + // each segment is processed by a work-group + const std::size_t elems_per_segment = (n + n_segments - 1) / n_segments; + const std::size_t no_op_flag_id = n_counts - 1; + + assert(n_counts == (n_segments + 1) * radix_states + 1); + + sycl::event local_count_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependency_events); + + sycl::local_accessor counts_lacc(wg_size * radix_states, + cgh); + + sycl::nd_range<1> ndRange(n_iters * n_segments * wg_size, wg_size); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + // 0 <= lid < wg_size + const std::size_t lid = ndit.get_local_id(0); + // 0 <= group_id < n_segments * n_iters + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / n_segments; + const std::size_t val_iter_offset = iter_id * n; + // 0 <= wgr_id < n_segments + const std::size_t wgr_id = group_id - iter_id * n_segments; + + const std::size_t seg_start = elems_per_segment * wgr_id; + + // count per work-item: create a private array for storing count + // values here bin_count = radix_states + std::array counts_arr = {CountT{0}}; + + // count per work-item: count values and write result to private + // count array + const std::size_t seg_end = + sycl::min(seg_start + elems_per_segment, n); + if (is_ascending) { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += wg_size) { + // get the bucket for the bit-ordered input value, + // applying the offset and mask for radix bits + const auto val = + order_preserving_cast( + proj_op(vals_ptr[val_iter_offset + val_id])); + const std::uint32_t bucket_id = + get_bucket_id(val, radix_offset); + + // increment counter for this bit bucket + ++counts_arr[bucket_id]; + } + } + else { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += wg_size) { + // get the bucket for the bit-ordered input value, + // applying the offset and mask for radix bits + const auto val = + order_preserving_cast( + proj_op(vals_ptr[val_iter_offset + 
val_id]));
+                    const std::uint32_t bucket_id =
+                        get_bucket_id<radix_mask>(val, radix_offset);
+
+                    // increment counter for this bit bucket
+                    ++counts_arr[bucket_id];
+                }
+            }
+
+            // count per work-item: write the private count array to the
+            // local count array; counts_lacc is the concatenation of the
+            // private count arrays from each work-item, in the order of
+            // their local ids
+            const std::uint32_t count_start_id = radix_states * lid;
+            for (std::uint32_t radix_state_id = 0;
+                 radix_state_id < radix_states; ++radix_state_id) {
+                counts_lacc[count_start_id + radix_state_id] =
+                    counts_arr[radix_state_id];
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            // count per work-group: reduce while counts_lacc[] is larger
+            // than wg_size; all work-items in the work-group do the work
+            for (std::uint32_t i = 1; i < radix_states; ++i) {
+                // Since we are interested in computing the total count over
+                // the work-group for each radix state, the result is only
+                // correct if wg_size >= radix_states
+                counts_lacc[lid] += counts_lacc[wg_size * i + lid];
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            // count per work-group: reduce until counts_lacc[] is no larger
+            // than radix_states (n_witems /= 2 per iteration)
+            for (std::uint32_t n_witems = (wg_size >> 1);
+                 n_witems >= radix_states; n_witems >>= 1) {
+                if (lid < n_witems)
+                    counts_lacc[lid] += counts_lacc[n_witems + lid];
+
+                sycl::group_barrier(ndit.get_group());
+            }
+
+            const std::size_t iter_counter_offset = iter_id * n_counts;
+
+            // count per work-group: write the local count array to the
+            // global count array
+            if (lid < radix_states) {
+                // move buckets with the same id to adjacent positions,
+                // thus splitting the count array into radix_states regions
+                counts_ptr[iter_counter_offset + (n_segments + 1) * lid +
+                           wgr_id] = counts_lacc[lid];
+            }
+
+            // side work: reset the 'no-operation' flag, which signals
+            // whether the re-order phase can be skipped
+            if (wgr_id == 0 && lid == 0) {
+                CountT &no_op_flag =
+                    counts_ptr[iter_counter_offset + no_op_flag_id];
+                no_op_flag = 0;
+            }
+        });
+    });
+
+    return local_count_ev;
+}
+
+//-----------------------------------------------------------------------
+// radix sort: scan kernel (single iteration)
+//-----------------------------------------------------------------------
+
+template <typename KernelNameT, std::uint32_t radix_bits, typename CountT>
+sycl::event radix_sort_scan_submit(sycl::queue &exec_q,
+                                   std::size_t n_iters,
+                                   std::size_t n_segments,
+                                   std::size_t wg_size,
+                                   std::size_t n_values,
+                                   std::size_t n_counts,
+                                   CountT *counts_ptr,
+                                   const std::vector<sycl::event> depends)
+{
+    const std::size_t no_op_flag_id = n_counts - 1;
+
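+    // Worked example (illustrative sizes, not taken from the code): with
+    // radix_states == 16 and n_segments == 3, each iteration's counts
+    // buffer holds 16 regions of scan_size == 4 counters plus a trailing
+    // no-op flag; the exclusive scan of region r turns the per-segment
+    // counts for bucket r into starting offsets, while its last element
+    // accumulates the total for bucket r, which the reorder phase folds
+    // into the cross-bucket prefix.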
+    // Scan produces local offsets using count values.
+    // There are no local offsets for the first segment, but the remaining
+    // segments must be scanned with respect to the count value of the
+    // first segment, which requires n + 1 positions
+    const std::size_t scan_size = n_segments + 1;
+    wg_size = std::min(scan_size, wg_size);
+
+    static constexpr std::uint32_t radix_states = std::uint32_t(1)
+                                                  << radix_bits;
+
+    // compiling the kernel ahead of submission prevents an out-of-resources
+    // issue, which may occur due to the use of collective algorithms such
+    // as joint_exclusive_scan even if local memory is not explicitly
+    // requested
+    sycl::event scan_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::nd_range<1> ndRange(n_iters * radix_states * wg_size, wg_size);
+
+        cgh.parallel_for<KernelNameT>(ndRange, [=](sycl::nd_item<1> ndit) {
+            const std::size_t group_id = ndit.get_group(0);
+            const std::size_t iter_id = group_id / radix_states;
+            const std::size_t wgr_id = group_id - iter_id * radix_states;
+            // find borders of a region with a specific bucket id
+            auto begin_ptr =
+                counts_ptr + scan_size * wgr_id + iter_id * n_counts;
+
+            sycl::joint_exclusive_scan(ndit.get_group(), begin_ptr,
+                                       begin_ptr + scan_size, begin_ptr,
+                                       CountT(0), sycl::plus{});
+
+            const auto lid = ndit.get_local_linear_id();
+
+            // NB: no race condition here, because the condition can be true
+            // for at most one work-item of one work-group
+            if ((lid == wg_size - 1) &&
+                (begin_ptr[scan_size - 1] == n_values)) {
+                // set the flag, since all the values fell into one bin;
+                // this optimization can trigger often for higher radix
+                // offsets (where the remaining bits are all zeros)
+                auto &no_op_flag =
+                    counts_ptr[iter_id * n_counts + no_op_flag_id];
+                no_op_flag = 1;
+            }
+        });
+    });
+
+    return scan_ev;
+}
+
+//-----------------------------------------------------------------------
+// radix sort: group level reorder algorithms
+//-----------------------------------------------------------------------
+
+struct empty_storage
+{
+    template <typename... T>
+    empty_storage(T &&...)
+    {
+    }
+};
+
+// std::uint32_t value with the `n` least significant bits set
+inline std::uint32_t n_ls_bits_set(std::uint32_t n) noexcept
+{
+    static constexpr std::uint32_t zero{};
+    static constexpr std::uint32_t all_bits_set = ~zero;
+
+    return ~(all_bits_set << n);
+}
+
+enum class peer_prefix_algo
+{
+    subgroup_ballot,
+    atomic_fetch_or,
+    scan_then_broadcast
+};
+
+template <typename OffsetT, peer_prefix_algo Algo>
+struct peer_prefix_helper;
+
+template <typename AccT>
+auto get_accessor_pointer(const AccT &acc)
+{
+    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
+}
+
+template <typename OffsetT>
+struct peer_prefix_helper<OffsetT, peer_prefix_algo::atomic_fetch_or>
+{
+    using AtomicT =
+        sycl::atomic_ref<std::uint32_t,
+                         sycl::memory_order::relaxed,
+                         sycl::memory_scope::sub_group,
+                         sycl::access::address_space::local_space>;
+    using TempStorageT = sycl::local_accessor<std::uint32_t, 1>;
+
+private:
+    sycl::sub_group sgroup;
+    std::uint32_t lid;
+    std::uint32_t item_mask;
+    AtomicT atomic_peer_mask;
+
+public:
+    peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT lacc)
+        : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()),
+          item_mask(n_ls_bits_set(lid)), atomic_peer_mask(lacc[0])
+    {
+    }
+
+    std::uint32_t peer_contribution(OffsetT &new_offset_id,
+                                    OffsetT offset_prefix,
+                                    bool wi_bit_set) const
+    {
+        // reset mask for each radix state
+        if (lid == 0)
+            atomic_peer_mask.store(std::uint32_t{0});
+        sycl::group_barrier(sgroup);
+
+        const std::uint32_t uint_contrib{wi_bit_set ?
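+        // Peer-contribution sketch (hypothetical 8-wide sub-group, not
+        // taken from the code): if work-items {0, 2, 3, 5} hold the current
+        // bucket, the shared mask becomes 0b00101101; work-item 3 keeps
+        // only the bits below its lane (0b00000101), so popcount gives it
+        // rank 2 within the bucket, while popcount of the full mask (4)
+        // advances the bucket offset for the next sub-group pass.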
std::uint32_t{1} + : std::uint32_t{0}}; + + // set local id's bit to 1 if the bucket value matches the radix state + atomic_peer_mask.fetch_or(uint_contrib << lid); + sycl::group_barrier(sgroup); + std::uint32_t peer_mask_bits = atomic_peer_mask.load(); + std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits); + + // get the local offset index from the bits set in the peer mask with + // index less than the work item ID + peer_mask_bits &= item_mask; + new_offset_id |= wi_bit_set + ? (offset_prefix + sycl::popcount(peer_mask_bits)) + : OffsetT{0}; + return sg_total_offset; + } +}; + +template +struct peer_prefix_helper +{ + using TempStorageT = empty_storage; + using ItemType = sycl::nd_item<1>; + using SubGroupType = sycl::sub_group; + +private: + SubGroupType sgroup; + std::uint32_t sg_size; + +public: + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT) + : sgroup(ndit.get_sub_group()), sg_size(sgroup.get_local_range()[0]) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + const std::uint32_t contrib{wi_bit_set ? std::uint32_t{1} + : std::uint32_t{0}}; + + std::uint32_t sg_item_offset = sycl::exclusive_scan_over_group( + sgroup, contrib, sycl::plus{}); + + new_offset_id |= + (wi_bit_set ? (offset_prefix + sg_item_offset) : OffsetT(0)); + + // the last scanned value does not contain number of all copies, thus + // adding contribution + std::uint32_t sg_total_offset = sycl::group_broadcast( + sgroup, sg_item_offset + contrib, sg_size - 1); + + return sg_total_offset; + } +}; + +template +struct peer_prefix_helper +{ +private: + sycl::sub_group sgroup; + std::uint32_t lid; + sycl::ext::oneapi::sub_group_mask item_sg_mask; + + sycl::ext::oneapi::sub_group_mask mask_builder(std::uint32_t mask, + std::uint32_t sg_size) + { + return sycl::detail::Builder::createSubGroupMask< + sycl::ext::oneapi::sub_group_mask>(mask, sg_size); + } + +public: + using TempStorageT = empty_storage; + + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT) + : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()), + item_sg_mask( + mask_builder(n_ls_bits_set(lid), sgroup.get_local_linear_range())) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + // set local id's bit to 1 if the bucket value matches the radix state + auto peer_mask = sycl::ext::oneapi::group_ballot(sgroup, wi_bit_set); + std::uint32_t peer_mask_bits{}; + + peer_mask.extract_bits(peer_mask_bits); + std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits); + + // get the local offset index from the bits set in the peer mask with + // index less than the work item ID + peer_mask &= item_sg_mask; + peer_mask.extract_bits(peer_mask_bits); + + new_offset_id |= wi_bit_set + ? 
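+        // NB: extract_bits() materializes the ballot into a single
+        // std::uint32_t, which is why the dispatch in
+        // parallel_radix_sort_iteration_step only selects this helper for
+        // sub-group sizes of 8, 16, or 32.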
(offset_prefix + sycl::popcount(peer_mask_bits)) + : OffsetT(0); + + return sg_total_offset; + } +}; + +template +void copy_func_for_radix_sort(const std::size_t n_segments, + const std::size_t elems_per_segment, + const std::size_t sg_size, + const std::uint32_t lid, + const std::size_t wgr_id, + const InputT *input_ptr, + const std::size_t n_values, + OutputT *output_ptr) +{ + // item info + const std::size_t seg_start = elems_per_segment * wgr_id; + + std::size_t seg_end = sycl::min(seg_start + elems_per_segment, n_values); + + // ensure that each work item in a subgroup does the same number of loop + // iterations + const std::uint16_t tail_size = (seg_end - seg_start) % sg_size; + seg_end -= tail_size; + + // find offsets for the same values within a segment and fill the resulting + // buffer + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + output_ptr[val_id] = std::move(input_ptr[val_id]); + } + + if (tail_size > 0 && lid < tail_size) { + const std::size_t val_id = seg_end + lid; + output_ptr[val_id] = std::move(input_ptr[val_id]); + } +} + +//----------------------------------------------------------------------- +// radix sort: reorder kernel (per iteration) +//----------------------------------------------------------------------- +template +sycl::event + radix_sort_reorder_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::uint32_t radix_offset, + std::size_t n_values, + const InputT *input_ptr, + OutputT *output_ptr, + std::size_t n_offsets, + OffsetT *offset_ptr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector dependency_events) +{ + using ValueT = InputT; + using PeerHelper = peer_prefix_helper; + + static constexpr std::uint32_t radix_states = std::uint32_t{1} + << radix_bits; + static constexpr std::uint32_t radix_mask = radix_states - 1; + const std::size_t elems_per_segment = + (n_values + n_segments - 1) / n_segments; + + const std::size_t no_op_flag_id = n_offsets - 1; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + sycl::event reorder_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependency_events); + cgh.use_kernel_bundle(kb); + + using StorageT = typename PeerHelper::TempStorageT; + + StorageT peer_temp(1, cgh); + + sycl::range<1> lRange{sg_size}; + sycl::range<1> gRange{n_iters * n_segments * sg_size}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + + auto b_offset_ptr = offset_ptr + iter_id * n_offsets; + auto b_input_ptr = input_ptr + iter_id * n_values; + auto b_output_ptr = output_ptr + iter_id * n_values; + + const std::uint32_t lid = ndit.get_local_id(0); + + auto &no_op_flag = b_offset_ptr[no_op_flag_id]; + if (no_op_flag) { + // no reordering necessary, simply copy + copy_func_for_radix_sort( + n_segments, elems_per_segment, sg_size, lid, segment_id, + b_input_ptr, n_values, b_output_ptr); + return; + } + + // create a private array for storing offset values + // and add total offset and offset for compute unit + 
// for a certain radix state + std::array offset_arr{}; + const std::size_t scan_size = n_segments + 1; + + OffsetT scanned_bin = 0; + + /* find cumulative offset */ + static constexpr std::uint32_t zero_radix_state_id = 0; + offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id]; + + for (std::uint32_t radix_state_id = 1; + radix_state_id < radix_states; ++radix_state_id) { + const std::uint32_t local_offset_id = + segment_id + scan_size * radix_state_id; + + // scan bins serially + const std::size_t last_segment_bucket_id = + radix_state_id * scan_size - 1; + scanned_bin += b_offset_ptr[last_segment_bucket_id]; + + offset_arr[radix_state_id] = + scanned_bin + b_offset_ptr[local_offset_id]; + } + + const std::size_t seg_start = elems_per_segment * segment_id; + std::size_t seg_end = + sycl::min(seg_start + elems_per_segment, n_values); + // ensure that each work item in a subgroup does the same number of + // loop iterations + const std::uint32_t tail_size = (seg_end - seg_start) % sg_size; + seg_end -= tail_size; + + const PeerHelper peer_prefix_hlp(ndit, peer_temp); + + // find offsets for the same values within a segment and fill the + // resulting buffer + if (is_ascending) { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + ValueT in_val = std::move(b_input_ptr[val_id]); + + // get the bucket for the bit-ordered input value, applying + // the offset and mask for radix bits + const auto mapped_val = + order_preserving_cast( + proj_op(in_val)); + std::uint32_t bucket_id = + get_bucket_id(mapped_val, radix_offset); + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + /* modified by reference */ new_offset_id, + offset_arr[radix_state_id], + /* bit contribution from this work-item */ + is_current_bucket); + offset_arr[radix_state_id] += sg_total_offset; + } + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + else { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + ValueT in_val = std::move(b_input_ptr[val_id]); + + // get the bucket for the bit-ordered input value, applying + // the offset and mask for radix bits + const auto mapped_val = + order_preserving_cast( + proj_op(in_val)); + std::uint32_t bucket_id = + get_bucket_id(mapped_val, radix_offset); + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + /* modified by reference */ new_offset_id, + offset_arr[radix_state_id], + /* bit contribution from this work-item */ + is_current_bucket); + offset_arr[radix_state_id] += sg_total_offset; + } + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + if (tail_size > 0) { + ValueT in_val; + + // default: is greater than any actual radix state + std::uint32_t bucket_id = radix_states; + if (lid < tail_size) { + in_val = std::move(b_input_ptr[seg_end + lid]); + + const auto proj_val = proj_op(in_val); + const auto mapped_val = + (is_ascending) + ? 
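+                        // NB: work-items past the tail keep the default
+                        // bucket_id == radix_states, which matches no real
+                        // radix state below, so they contribute nothing to
+                        // any peer mask while still executing the uniform,
+                        // sub-group-collective peer_contribution calls.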
order_preserving_cast( + proj_val) + : order_preserving_cast( + proj_val); + bucket_id = + get_bucket_id(mapped_val, radix_offset); + } + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + new_offset_id, offset_arr[radix_state_id], + is_current_bucket); + + offset_arr[radix_state_id] += sg_total_offset; + } + + if (lid < tail_size) { + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + }); + }); + + return reorder_ev; +} + +template +sizeT _slm_adjusted_work_group_size(sycl::queue &exec_q, + sizeT required_slm_bytes_per_wg, + sizeT wg_size) +{ + const auto &dev = exec_q.get_device(); + + if (wg_size == 0) + wg_size = + dev.template get_info(); + + const auto local_mem_sz = + dev.template get_info(); + + return sycl::min(local_mem_sz / required_slm_bytes_per_wg, wg_size); +} + +//----------------------------------------------------------------------- +// radix sort: one iteration +//----------------------------------------------------------------------- + +template +struct parallel_radix_sort_iteration_step +{ + template + using count_phase = radix_sort_count_kernel; + template + using local_scan_phase = radix_sort_scan_kernel; + template + using reorder_peer_phase = + radix_sort_reorder_peer_kernel; + template + using reorder_phase = radix_sort_reorder_kernel; + + template + static sycl::event submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::uint32_t radix_iter, + std::size_t n_values, + const InputT *in_ptr, + OutputT *out_ptr, + std::size_t n_counts, + CountT *counts_ptr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector &dependency_events) + { + using _RadixCountKernel = count_phase; + using _RadixLocalScanKernel = + local_scan_phase; + using _RadixReorderPeerKernel = + reorder_peer_phase; + using _RadixReorderKernel = + reorder_phase; + + const auto &supported_sub_group_sizes = + exec_q.get_device() + .template get_info(); + const std::size_t max_sg_size = + (supported_sub_group_sizes.empty() + ? 0 + : supported_sub_group_sizes.back()); + const std::size_t reorder_sg_size = max_sg_size; + const std::size_t scan_wg_size = + exec_q.get_device() + .template get_info(); + + static constexpr std::size_t two_mils = (std::size_t(1) << 21); + std::size_t count_wg_size = + ((max_sg_size > 0) && (n_values > two_mils) ? 128 : max_sg_size); + + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + // correct count_wg_size according to local memory limit in count phase + const auto max_count_wg_size = _slm_adjusted_work_group_size( + exec_q, sizeof(CountT) * radix_states, count_wg_size); + count_wg_size = + static_cast<::std::size_t>((max_count_wg_size / radix_states)) * + radix_states; + + // work-group size must be a power of 2 and not less than the number of + // states, for scanning to work correctly + + const std::size_t rounded_down_count_wg_size = + std::size_t{1} << (number_of_bits_in_type() - + sycl::clz(count_wg_size) - 1); + count_wg_size = + sycl::max(rounded_down_count_wg_size, std::size_t(radix_states)); + + // Compute the radix position for the given iteration + std::uint32_t radix_offset = radix_iter * radix_bits; + + // 1. 
Count Phase + sycl::event count_ev = + radix_sort_count_submit<_RadixCountKernel, radix_bits>( + exec_q, n_iters, n_segments, count_wg_size, radix_offset, + n_values, in_ptr, n_counts, counts_ptr, proj_op, is_ascending, + dependency_events); + + // 2. Scan Phase + sycl::event scan_ev = + radix_sort_scan_submit<_RadixLocalScanKernel, radix_bits>( + exec_q, n_iters, n_segments, scan_wg_size, n_values, n_counts, + counts_ptr, {count_ev}); + + // 3. Reorder Phase + sycl::event reorder_ev{}; + // subgroup_ballot-based peer algo uses extract_bits to populate + // uint32_t mask and hence relies on sub-group to be 32 or narrower + static constexpr std::size_t sg32_v = 32u; + static constexpr std::size_t sg16_v = 16u; + static constexpr std::size_t sg08_v = 8u; + if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size || + sg08_v == reorder_sg_size) { + static constexpr auto peer_algorithm = + peer_prefix_algo::subgroup_ballot; + + reorder_ev = radix_sort_reorder_submit<_RadixReorderPeerKernel, + radix_bits, peer_algorithm>( + exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr, + out_ptr, n_counts, counts_ptr, proj_op, is_ascending, + {scan_ev}); + } + else { + static constexpr auto peer_algorithm = + peer_prefix_algo::scan_then_broadcast; + + reorder_ev = radix_sort_reorder_submit<_RadixReorderKernel, + radix_bits, peer_algorithm>( + exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr, + out_ptr, n_counts, counts_ptr, proj_op, is_ascending, + {scan_ev}); + } + + return reorder_ev; + } +}; // struct parallel_radix_sort_iteration + +template +class radix_sort_one_wg_krn; + +template +struct subgroup_radix_sort +{ +private: + class use_slm_tag + { + }; + class use_global_mem_tag + { + }; + +public: + template + sycl::event operator()(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_to_sort, + ValueT *input_ptr, + OutputT *output_ptr, + ProjT proj_op, + const bool is_ascending, + const std::vector &depends) + { + static_assert(std::is_same_v, OutputT>); + + using _SortKernelLoc = + radix_sort_one_wg_krn; + using _SortKernelPartGlob = + radix_sort_one_wg_krn; + using _SortKernelGlob = + radix_sort_one_wg_krn; + + static constexpr std::size_t max_concurrent_work_groups = 128U; + + // Choose this to occupy the entire accelerator + const std::size_t n_work_groups = + std::min(n_iters, max_concurrent_work_groups); + + // determine which temporary allocation can be accommodated in SLM + const auto &SLM_availability = + check_slm_size(exec_q, n_to_sort); + + const std::size_t n_batch_size = n_work_groups; + + switch (SLM_availability) { + case temp_allocations::both_in_slm: + { + static constexpr auto storage_for_values = use_slm_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; + + return one_group_submitter<_SortKernelLoc>()( + exec_q, n_iters, n_iters, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + case temp_allocations::counters_in_slm: + { + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; + + return one_group_submitter<_SortKernelPartGlob>()( + exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + default: + { + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_global_mem_tag{}; + + return one_group_submitter<_SortKernelGlob>()( + exec_q, n_iters, 
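+            // n_batch_size caps how many of the n_iters independent rows
+            // are resident at once, bounding the global-memory temporaries
+            // when the values (and/or the counters) do not fit in SLM.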
n_batch_size, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + } + } + +private: + template + class TempBuf; + + template + class TempBuf + { + std::size_t buf_size; + + public: + TempBuf(std::size_t, std::size_t n) : buf_size(n) {} + auto get_acc(sycl::handler &cgh) + { + return sycl::local_accessor(buf_size, cgh); + } + + std::size_t get_iter_stride() const { return std::size_t{0}; } + }; + + template + class TempBuf + { + sycl::buffer buf; + std::size_t iter_stride; + + public: + TempBuf(std::size_t n_iters, std::size_t n) + : buf(n_iters * n), iter_stride(n) + { + } + auto get_acc(sycl::handler &cgh) + { + return sycl::accessor(buf, cgh, sycl::read_write, sycl::no_init); + } + std::size_t get_iter_stride() const { return iter_stride; } + }; + + static_assert(wg_size <= 1024); + static constexpr std::uint16_t bin_count = (1 << radix); + static constexpr std::uint16_t counter_buf_sz = wg_size * bin_count + 1; + + enum class temp_allocations + { + both_in_slm, + counters_in_slm, + both_in_global_mem + }; + + template + temp_allocations check_slm_size(const sycl::queue &exec_q, SizeT n) + { + // the kernel is designed for data size <= 64K + assert(n <= (SizeT(1) << 16)); + + static constexpr auto req_slm_size_counters = + counter_buf_sz * sizeof(std::uint16_t); + + const auto &dev = exec_q.get_device(); + + // Pessimistically only use half of the memory to take into account + // a SYCL group algorithm might use a portion of SLM + const std::size_t max_slm_size = + dev.template get_info() / 2; + + const auto n_uniform = 1 << ceil_log2(n); + const auto req_slm_size_val = sizeof(T) * n_uniform; + + return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size) + ? + // the values and the counters are placed in SLM + temp_allocations::both_in_slm + : (req_slm_size_counters <= max_slm_size) + ? 
+ // the counters are placed in SLM, the values - in the + // global memory + temp_allocations::counters_in_slm + : + // the values and the counters are placed in the global + // memory + temp_allocations::both_in_global_mem; + } + + template + struct one_group_submitter + { + template + sycl::event operator()(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_batch_size, + std::size_t n_values, + InputT *input_arr, + OutputT *output_arr, + const ProjT &proj_op, + const bool is_ascending, + SLM_value_tag, + SLM_counter_tag, + const std::vector &depends) + { + assert(!(n_values >> 16)); + + assert(n_values <= static_cast(block_size) * + static_cast(wg_size)); + + const std::uint16_t n = static_cast(n_values); + static_assert(std::is_same_v, OutputT>); + + using ValueT = OutputT; + + using KeyT = std::invoke_result_t; + + TempBuf buf_val( + n_batch_size, static_cast(block_size * wg_size)); + TempBuf buf_count( + n_batch_size, static_cast(counter_buf_sz)); + + sycl::range<1> lRange{wg_size}; + + sycl::event sort_ev; + std::vector deps{depends}; + + const std::size_t n_batches = + (n_iters + n_batch_size - 1) / n_batch_size; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + const auto &krn = kb.get_kernel(kernel_id); + + const std::uint32_t krn_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // due to a bug in CPU device implementation, an additional + // synchronization is necessary for short sub-group sizes + const bool work_around_needed = + exec_q.get_device().has(sycl::aspect::cpu) && + (krn_sg_size < 16); + + for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) { + + const std::size_t block_start = batch_id * n_batch_size; + + // input_arr/output_arr each has shape (n_iters, n) + InputT *this_input_arr = input_arr + block_start * n_values; + OutputT *this_output_arr = output_arr + block_start * n_values; + + const std::size_t block_end = + std::min(block_start + n_batch_size, n_iters); + + sycl::range<1> gRange{(block_end - block_start) * wg_size}; + sycl::nd_range ndRange{gRange, lRange}; + + sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + cgh.use_kernel_bundle(kb); + + // allocation to use for value exchanges + auto exchange_acc = buf_val.get_acc(cgh); + const std::size_t exchange_acc_iter_stride = + buf_val.get_iter_stride(); + + // allocation for counters + auto counter_acc = buf_count.get_acc(cgh); + const std::size_t counter_acc_iter_stride = + buf_count.get_iter_stride(); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> + ndit) { + ValueT values[block_size]; + + const std::size_t iter_id = ndit.get_group(0); + const std::size_t iter_val_offset = + iter_id * static_cast(n); + const std::size_t iter_counter_offset = + iter_id * counter_acc_iter_stride; + const std::size_t iter_exchange_offset = + iter_id * exchange_acc_iter_stride; + + std::uint16_t wi = ndit.get_local_linear_id(); + std::uint16_t begin_bit = 0; + + static constexpr std::uint16_t end_bit = + number_of_bits_in_type(); + + // copy from input array into values +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + values[i] = + (id < n) ? 
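+                        // Padding sketch: lanes whose global id falls past
+                        // n load a default-constructed ValueT; the counting
+                        // phase later maps such padded slots to the highest
+                        // bin, so they cluster past the live elements and
+                        // the final write-out (guarded by r < n) drops them.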
this_input_arr[iter_val_offset + id] + : ValueT{}; + } + + while (true) { + // indices for indirect access in the "re-order" + // phase + std::uint16_t indices[block_size]; + { + // pointers to bucket's counters + std::uint16_t *counters[block_size]; + + // counting phase + auto pcounter = + get_accessor_pointer(counter_acc) + + (wi + iter_counter_offset); + + // initialize counters +#pragma unroll + for (std::uint16_t i = 0; i < bin_count; ++i) + pcounter[i * wg_size] = std::uint16_t{0}; + + sycl::group_barrier(ndit.get_group()); + + if (is_ascending) { +#pragma unroll + for (std::uint16_t i = 0; i < block_size; + ++i) { + const std::uint16_t id = + wi * block_size + i; + static constexpr std::uint16_t + bin_mask = bin_count - 1; + + // points to the padded element, i.e. id + // is in-range + static constexpr std::uint16_t + default_out_of_range_bin_id = + bin_mask; + + const std::uint16_t bin = + (id < n) + ? get_bucket_id( + order_preserving_cast< + /* is_ascending */ + true>( + proj_op(values[i])), + begin_bit) + : default_out_of_range_bin_id; + + // counting and local offset calculation + counters[i] = &pcounter[bin * wg_size]; + indices[i] = *counters[i]; + *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } + } + } + else { +#pragma unroll + for (std::uint16_t i = 0; i < block_size; + ++i) { + const std::uint16_t id = + wi * block_size + i; + static constexpr std::uint16_t + bin_mask = bin_count - 1; + + // points to the padded element, i.e. id + // is in-range + static constexpr std::uint16_t + default_out_of_range_bin_id = + bin_mask; + + const std::uint16_t bin = + (id < n) + ? get_bucket_id( + order_preserving_cast< + /* is_ascending */ + false>( + proj_op(values[i])), + begin_bit) + : default_out_of_range_bin_id; + + // counting and local offset calculation + counters[i] = &pcounter[bin * wg_size]; + indices[i] = *counters[i]; + *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } + } + } + + sycl::group_barrier(ndit.get_group()); + + // exclusive scan phase + { + + // scan contiguous numbers + std::uint16_t bin_sum[bin_count]; + const std::size_t counter_offset0 = + iter_counter_offset + wi * bin_count; + bin_sum[0] = counter_acc[counter_offset0]; + +#pragma unroll + for (std::uint16_t i = 1; i < bin_count; + ++i) + bin_sum[i] = + bin_sum[i - 1] + + counter_acc[counter_offset0 + i]; + + sycl::group_barrier(ndit.get_group()); + + // exclusive scan local sum + std::uint16_t sum_scan = + sycl::exclusive_scan_over_group( + ndit.get_group(), + bin_sum[bin_count - 1], + sycl::plus()); + +// add to local sum, generate exclusive scan result +#pragma unroll + for (std::uint16_t i = 0; i < bin_count; + ++i) + counter_acc[counter_offset0 + i + 1] = + sum_scan + bin_sum[i]; + + if (wi == 0) + counter_acc[iter_counter_offset + 0] = + std::uint32_t{0}; + + sycl::group_barrier(ndit.get_group()); + } + +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + // a global index is a local offset plus a + // global base index + indices[i] += *counters[i]; + } + + sycl::group_barrier(ndit.get_group()); + } + + begin_bit += radix; + + // "re-order" phase + sycl::group_barrier(ndit.get_group()); + if (begin_bit >= end_bit) { + // the last iteration - writing out the result +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; + if (r < n) { + this_output_arr[iter_val_offset + r] = + values[i]; + } + } + + return; + } + + // 
data exchange +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; + if (r < n) + exchange_acc[iter_exchange_offset + r] = + values[i]; + } + + sycl::group_barrier(ndit.get_group()); + +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + if (id < n) + values[i] = + exchange_acc[iter_exchange_offset + id]; + } + + sycl::group_barrier(ndit.get_group()); + } + }); + }); + + deps = {sort_ev}; + } + + return sort_ev; + } + }; +}; + +template +struct OneWorkGroupRadixSortKernel; + +//----------------------------------------------------------------------- +// radix sort: main function +//----------------------------------------------------------------------- +template +sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_to_sort, + const ValueT *input_arr, + ValueT *output_arr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector &depends) +{ + assert(n_to_sort > 1); + + using KeyT = std::remove_cv_t< + std::remove_reference_t>>; + + // radix bits represent number of processed bits in each value during one + // iteration + static constexpr std::uint32_t radix_bits = 4; + + sycl::event sort_ev{}; + + const auto &dev = exec_q.get_device(); + const auto max_wg_size = + dev.template get_info(); + + static constexpr std::uint16_t ref_wg_size = 64; + if (n_to_sort <= 16384 && ref_wg_size * 8 <= max_wg_size) { + using _RadixSortKernel = OneWorkGroupRadixSortKernel; + + if (n_to_sort <= 64 && ref_wg_size <= max_wg_size) { + // wg_size * block_size == 64 * 1 * 1 == 64 + static constexpr std::uint16_t wg_size = ref_wg_size; + static constexpr std::uint16_t block_size = 1; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 128 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 1 == 128 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 1; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 256 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 2 == 256 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 2; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 512 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 4 == 512 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 4; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 1024 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 8 == 1024 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 8; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + 
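+        // NB: each branch of this ladder fixes wg_size * block_size at
+        // compile time to the next power-of-two capacity (64 * 2 * 8 == 1024
+        // here), so the per-work-item `values[block_size]` array in the
+        // one-work-group kernel is statically sized and a single work-group
+        // owns the whole row.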
} + else if (n_to_sort <= 2048 && ref_wg_size * 4 <= max_wg_size) { + // wg_size * block_size == 64 * 4 * 8 == 2048 + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 8; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 4096 && ref_wg_size * 4 <= max_wg_size) { + // wg_size * block_size == 64 * 4 * 16 == 4096 + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 16; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 8192 && ref_wg_size * 8 <= max_wg_size) { + // wg_size * block_size == 64 * 8 * 16 == 8192 + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 16; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else { + // wg_size * block_size == 64 * 8 * 32 == 16384 + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 32; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + } + else { + static constexpr std::uint32_t radix_iters = + number_of_buckets_in_type(radix_bits); + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + static constexpr std::size_t bound_512k = (std::size_t(1) << 19); + static constexpr std::size_t bound_2m = (std::size_t(1) << 21); + + const auto wg_sz_k = (n_to_sort < bound_512k) ? 8 + : (n_to_sort <= bound_2m) ? 
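+        // Sizing sketch (assuming radix_bits == 4): a 64-bit key takes
+        // radix_iters == 16 count/scan/reorder sweeps, ping-ponging between
+        // output_arr and tmp_arr; the even number of sweeps leaves the
+        // final result in output_arr. Smaller inputs use smaller
+        // work-groups (max_wg_size / 8 below 512K elements, / 4 up to 2M,
+        // full size beyond) to expose more segments.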
4 + : 1; + const std::size_t wg_size = max_wg_size / wg_sz_k; + + const std::size_t n_segments = (n_to_sort + wg_size - 1) / wg_size; + + // Additional radix_states elements are used for getting local offsets + // from count values + no_op flag; 'No operation' flag specifies whether + // to skip re-order phase if the all keys are the same (lie in one bin) + const std::size_t n_counts = + (n_segments + 1) * radix_states + 1 /*no_op flag*/; + + using CountT = std::uint32_t; + + // memory for storing count and offset values + auto count_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_counts, exec_q); + + CountT *count_ptr = count_owner.get(); + + static constexpr std::uint32_t zero_radix_iter{0}; + + if constexpr (std::is_same_v) { + + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments, + zero_radix_iter, n_to_sort, + input_arr, output_arr, + n_counts, count_ptr, proj_op, + is_ascending, depends); + + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, count_owner); + + return sort_ev; + } + + auto tmp_arr_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_to_sort, exec_q); + + ValueT *tmp_arr = tmp_arr_owner.get(); + + // iterations per each bucket + assert("Number of iterations must be even" && radix_iters % 2 == 0); + assert(radix_iters > 0); + + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments, + zero_radix_iter, n_to_sort, + input_arr, tmp_arr, n_counts, + count_ptr, proj_op, is_ascending, + depends); + + for (std::uint32_t radix_iter = 1; radix_iter < radix_iters; + ++radix_iter) { + if (radix_iter % 2 == 0) { + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, + /*even=*/true>::submit(exec_q, n_iters, n_segments, + radix_iter, n_to_sort, output_arr, + tmp_arr, n_counts, count_ptr, + proj_op, is_ascending, {sort_ev}); + } + else { + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, + /*even=*/false>::submit(exec_q, n_iters, n_segments, + radix_iter, n_to_sort, tmp_arr, + output_arr, n_counts, count_ptr, + proj_op, is_ascending, {sort_ev}); + } + } + + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, tmp_arr_owner, count_owner); + } + + return sort_ev; +} + +struct IdentityProj +{ + constexpr IdentityProj() {} + + template + constexpr T operator()(T val) const + { + return val; + } +}; + +template +struct ValueProj +{ + constexpr ValueProj() {} + + constexpr ValueT operator()(const std::pair &pair) const + { + return pair.first; + } +}; + +template +struct IndexedProj +{ + IndexedProj(const ValueT *arg_ptr) : ptr(arg_ptr), value_projector{} {} + + IndexedProj(const ValueT *arg_ptr, const ProjT &proj_op) + : ptr(arg_ptr), value_projector(proj_op) + { + } + + auto operator()(IndexT i) const { return value_projector(ptr[i]); } + +private: + const ValueT *ptr; + ProjT value_projector; +}; + +} // namespace radix_sort_details + +using dpctl::tensor::ssize_t; + +template +sycl::event + radix_sort_axis1_contig_impl(sycl::queue &exec_q, + const bool sort_ascending, + // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t iter_nelems, + // size of each array to sort (length of rows, + // i.e. 
number of columns) + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + using Proj = radix_sort_details::IdentityProj; + static constexpr Proj proj_op{}; + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, proj_op, + sort_ascending, depends); + + return radix_sort_ev; +} + +template +class radix_argsort_index_write_out_krn; + +template +class radix_argsort_iota_krn; + +template +sycl::event + radix_argsort_axis1_contig_impl(sycl::queue &exec_q, + const bool sort_ascending, + // number of sub-arrays to sort (num. of + // rows in a matrix when sorting over rows) + std::size_t iter_nelems, + // size of each array to sort (length of + // rows, i.e. number of columns) + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + const std::size_t total_nelems = iter_nelems * sort_nelems; + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(total_nelems, + exec_q); + + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); + + using IdentityProjT = radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp}; + + using IotaKernelName = radix_argsort_iota_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op, + sort_ascending, {iota_ev}); + + using MapBackKernelName = radix_argsort_index_write_out_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; + + sycl::event dep = radix_sort_ev; + + // no need to perform map_back ( id % sort_nelems) + // if total_nelems == sort_nelems + if (iter_nelems > 1u) { + dep = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {dep}); + } + + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dep}, workspace_owner); + + return cleanup_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp new file mode 100644 index 000000000000..1f3576402511 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp @@ -0,0 +1,119 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines binary-search utilities used by the tensor searchsorted
+/// kernels.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+
+namespace dpctl::tensor::kernels::search_sorted_detail
+{
+
+template <typename T>
+T quotient_ceil(T n, T m)
+{
+    return (n + m - 1) / m;
+}
+
+template <typename Acc, typename Value, typename Compare>
+std::size_t lower_bound_impl(const Acc acc,
+                             const std::size_t first,
+                             const std::size_t last,
+                             const Value &value,
+                             const Compare &comp)
+{
+    std::size_t n = last - first;
+    std::size_t cur = n, start = first;
+    std::size_t it;
+    while (n > 0) {
+        it = start;
+        cur = n / 2;
+        it += cur;
+        if (comp(acc[it], value)) {
+            n -= cur + 1, start = ++it;
+        }
+        else
+            n = cur;
+    }
+    return start;
+}
+
+template <typename Acc, typename Value, typename Compare>
+std::size_t upper_bound_impl(const Acc acc,
+                             const std::size_t first,
+                             const std::size_t last,
+                             const Value &value,
+                             const Compare &comp)
+{
+    const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); };
+    return lower_bound_impl(acc, first, last, value, op_comp);
+}
+
+template <typename Acc, typename Value, typename Compare, typename IndexerT>
+std::size_t lower_bound_indexed_impl(const Acc acc,
+                                     std::size_t first,
+                                     std::size_t last,
+                                     const Value &value,
+                                     const Compare &comp,
+                                     const IndexerT &acc_indexer)
+{
+    std::size_t n = last - first;
+    std::size_t cur = n, start = first;
+    std::size_t it;
+    while (n > 0) {
+        it = start;
+        cur = n / 2;
+        it += cur;
+        if (comp(acc[acc_indexer(it)], value)) {
+            n -= cur + 1, start = ++it;
+        }
+        else
+            n = cur;
+    }
+    return start;
+}
+
+template <typename Acc, typename Value, typename Compare, typename IndexerT>
+std::size_t upper_bound_indexed_impl(const Acc acc,
+                                     const std::size_t first,
+                                     const std::size_t last,
+                                     const Value &value,
+                                     const Compare &comp,
+                                     const IndexerT &acc_indexer)
+{
+    const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); };
+    return lower_bound_indexed_impl(acc, first, last, value, op_comp,
+                                    acc_indexer);
+}
+
+} // namespace dpctl::tensor::kernels::search_sorted_detail
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
new file mode 100644
index 000000000000..bc400c9e569a
--- /dev/null
+++
b/dpnp/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -0,0 +1,258 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "utils/offset_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +template +struct SearchSortedFunctor +{ +private: + const argTy *hay_tp; + const argTy *needles_tp; + indTy *positions_tp; + std::size_t hay_nelems; + HayIndexerT hay_indexer; + NeedlesIndexerT needles_indexer; + PositionsIndexerT positions_indexer; + +public: + SearchSortedFunctor(const argTy *hay_, + const argTy *needles_, + indTy *positions_, + const std::size_t hay_nelems_, + const HayIndexerT &hay_indexer_, + const NeedlesIndexerT &needles_indexer_, + const PositionsIndexerT &positions_indexer_) + : hay_tp(hay_), needles_tp(needles_), positions_tp(positions_), + hay_nelems(hay_nelems_), hay_indexer(hay_indexer_), + needles_indexer(needles_indexer_), + positions_indexer(positions_indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + const Compare comp{}; + + const std::size_t i = id[0]; + const argTy needle_v = needles_tp[needles_indexer(i)]; + + // position of the needle_v in the hay array + indTy pos{}; + + static constexpr std::size_t zero(0); + if constexpr (left_side) { + // search in hay in left-closed interval, give `pos` such that + // hay[pos - 1] < needle_v <= hay[pos] + + // lower_bound returns the first pos such that bool(hay[pos] < + // needle_v) is false, i.e. 
needle_v <= hay[pos] + pos = search_sorted_detail::lower_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + } + else { + // search in hay in right-closed interval: hay[pos - 1] <= needle_v + // < hay[pos] + + // upper_bound returns the first pos such that bool(needle_v < + // hay[pos]) is true, i.e. needle_v < hay[pos] + pos = search_sorted_detail::upper_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + } + + positions_tp[positions_indexer(i)] = pos; + } +}; + +typedef sycl::event (*searchsorted_contig_impl_fp_ptr_t)( + sycl::queue &, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + const std::vector &); + +template +class searchsorted_contig_impl_krn; + +template +sycl::event searchsorted_contig_impl(sycl::queue &exec_q, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + const char *needles_cp, + const ssize_t needles_offset, + char *positions_cp, + const ssize_t positions_offset, + const std::vector &depends) +{ + const argTy *hay_tp = reinterpret_cast(hay_cp) + hay_offset; + const argTy *needles_tp = + reinterpret_cast(needles_cp) + needles_offset; + + indTy *positions_tp = + reinterpret_cast(positions_cp) + positions_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class searchsorted_contig_impl_krn; + + sycl::range<1> gRange(needles_nelems); + + using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + static constexpr TrivialIndexerT hay_indexer{}; + static constexpr TrivialIndexerT needles_indexer{}; + static constexpr TrivialIndexerT positions_indexer{}; + + const auto fnctr = + SearchSortedFunctor( + hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer, + needles_indexer, positions_indexer); + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +typedef sycl::event (*searchsorted_strided_impl_fp_ptr_t)( + sycl::queue &, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + int, + const ssize_t *, + const std::vector &); + +template +class searchsorted_strided_impl_krn; + +template +sycl::event searchsorted_strided_impl( + sycl::queue &exec_q, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array + const ssize_t hay_stride, + const char *needles_cp, + const ssize_t needles_offset, + char *positions_cp, + const ssize_t positions_offset, + const int needles_nd, + // packed_shape_strides is [needles_shape, needles_strides, + // positions_strides] has length of 3*needles_nd + const ssize_t *packed_shape_strides, + const std::vector &depends) +{ + const argTy *hay_tp = reinterpret_cast(hay_cp); + const argTy *needles_tp = reinterpret_cast(needles_cp); + + indTy *positions_tp = reinterpret_cast(positions_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange(needles_nelems); + + using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + const HayIndexerT hay_indexer( + /* offset */ hay_offset, + /* size */ hay_nelems, + /* step */ hay_stride); + + using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const ssize_t *needles_shape_strides = 
packed_shape_strides; + const NeedlesIndexerT needles_indexer(needles_nd, needles_offset, + needles_shape_strides); + using PositionsIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *positions_shape = packed_shape_strides; + const ssize_t *positions_strides = + packed_shape_strides + 2 * needles_nd; + const PositionsIndexerT positions_indexer( + needles_nd, positions_offset, positions_shape, positions_strides); + + const auto fnctr = + SearchSortedFunctor( + hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer, + needles_indexer, positions_indexer); + using KernelName = + class searchsorted_strided_impl_krn; + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp new file mode 100644 index 000000000000..7b48f310a445 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp @@ -0,0 +1,61 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
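The `SearchSortedFunctor` above delegates the actual search to the `lower_bound_impl`/`upper_bound_impl` helpers from `search_sorted_detail.hpp`. A minimal host-side sketch of their semantics (the `main` harness and the sample data are illustrative and not part of the patch):

```cpp
#include <cassert>
#include <functional>
#include <vector>

#include "kernels/sorting/search_sorted_detail.hpp"

int main()
{
    namespace ssd = dpctl::tensor::kernels::search_sorted_detail;

    const std::vector<int> hay{1, 2, 2, 3};
    const std::less<int> comp{};

    // left side ("lower bound"): first pos with needle <= hay[pos]
    assert(ssd::lower_bound_impl(hay.data(), 0, hay.size(), 2, comp) == 1);
    // right side ("upper bound"): first pos with needle < hay[pos]
    assert(ssd::upper_bound_impl(hay.data(), 0, hay.size(), 2, comp) == 3);

    return 0;
}
```

The `_indexed_impl` variants used by the strided kernel behave identically, except each access goes through an indexer that translates a logical position into a memory offset.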
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+using dpctl::tensor::ssize_t;
+
+typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &,
+                                            std::size_t,
+                                            std::size_t,
+                                            const char *,
+                                            char *,
+                                            ssize_t,
+                                            ssize_t,
+                                            ssize_t,
+                                            ssize_t,
+                                            const std::vector<sycl::event> &);
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
new file mode 100644
index 000000000000..fd32905b808e
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
@@ -0,0 +1,144 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines utility kernels used by tensor sort/argsort operations.
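Note that `sort_contig_fn_ptr_t` carries no ascending/descending flag, so an implementation that takes one, such as `radix_sort_axis1_contig_impl` above, needs a thin adapter before it can populate a dispatch table of this type. A hypothetical sketch (the adapter name and the `float` instantiation are illustrative assumptions, not part of the patch):

```cpp
#include <cstddef>
#include <vector>

#include <sycl/sycl.hpp>

// Assumes the radix-sort header defining radix_sort_axis1_contig_impl
// (shown earlier in this patch) is on the include path.
using dpctl::tensor::ssize_t;

template <typename argTy>
sycl::event ascending_radix_sort_contig(sycl::queue &exec_q,
                                        std::size_t iter_nelems,
                                        std::size_t sort_nelems,
                                        const char *arg_cp, char *res_cp,
                                        ssize_t iter_arg_offset,
                                        ssize_t iter_res_offset,
                                        ssize_t sort_arg_offset,
                                        ssize_t sort_res_offset,
                                        const std::vector<sycl::event> &depends)
{
    // bind sort_ascending so the signature matches sort_contig_fn_ptr_t
    return dpctl::tensor::kernels::radix_sort_axis1_contig_impl<argTy>(
        exec_q, /*sort_ascending=*/true, iter_nelems, sort_nelems, arg_cp,
        res_cp, iter_arg_offset, iter_res_offset, sort_arg_offset,
        sort_res_offset, depends);
}

// usable as a table entry:
// dpctl::tensor::kernels::sort_contig_fn_ptr_t fn =
//     ascending_radix_sort_contig<float>;
```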
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::kernels::sort_utils_detail +{ + +namespace syclexp = sycl::ext::oneapi::experimental; + +template +sycl::event iota_impl(sycl::queue &exec_q, + T *data, + std::size_t nelems, + const std::vector &dependent_events) +{ + static constexpr std::uint32_t lws = 256; + static constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws); + + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t offset = (gid - lane_id) * n_wi; + const std::uint32_t max_sgSize = sg.get_max_local_range()[0]; + + std::array stripe{}; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + stripe[i] = T(offset + lane_id + i * max_sgSize); + } + + if (offset + n_wi * max_sgSize < nelems) { + static constexpr auto group_ls_props = + syclexp::properties{syclexp::data_placement_striped}; + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&data[offset]); + + syclexp::group_store(sg, sycl::span{&stripe[0], n_wi}, + out_multi_ptr, group_ls_props); + } + else { + for (std::size_t idx = offset + lane_id; idx < nelems; + idx += max_sgSize) { + data[idx] = T(idx); + } + } + }); + }); + + return e; +} + +template +sycl::event map_back_impl(sycl::queue &exec_q, + std::size_t nelems, + const IndexTy *flat_index_data, + IndexTy *reduced_index_data, + std::size_t row_size, + const std::vector &dependent_events) +{ + static constexpr std::uint32_t lws = 64; + static constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const IndexTy linear_index = flat_index_data[data_id]; + reduced_index_data[data_id] = (linear_index % row_size); + } + } + }); + }); + + return map_back_ev; +} + +} // namespace dpctl::tensor::kernels::sort_utils_detail diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp new file mode 100644 index 000000000000..1bbaa9e8345a --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -0,0 +1,508 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor topk operation. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace topk_detail +{ + +void scale_topk_params(const std::uint64_t nelems_per_slm, + const std::size_t sub_groups_per_work_group, + const std::uint32_t elems_per_wi, + const std::vector &sg_sizes, + std::size_t &lws, + std::size_t &nelems_wg_sorts) +{ + for (auto it = sg_sizes.rbegin(); it != sg_sizes.rend(); ++it) { + auto sg_size = *it; + lws = sub_groups_per_work_group * sg_size; + nelems_wg_sorts = elems_per_wi * lws; + if (nelems_wg_sorts < nelems_per_slm) { + return; + } + } + // should never reach + throw std::runtime_error("Could not construct top k kernel parameters"); +} + +template +sycl::event write_out_impl(sycl::queue &exec_q, + std::size_t iter_nelems, + std::size_t k, + const argTy *arg_tp, + const IndexTy *index_data, + std::size_t iter_index_stride, + std::size_t axis_nelems, + argTy *vals_tp, + IndexTy *inds_tp, + const std::vector &depends) +{ + static constexpr std::uint32_t lws = 64; + static constexpr std::uint32_t n_wi = 4; + const std::size_t nelems = iter_nelems * k; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> 
it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const std::size_t iter_id = data_id / k; + + /* + const std::size_t axis_gid = data_id - (iter_gid * k); + const std::size_t src_idx = iter_gid * iter_index_stride + + axis_gid; + */ + const std::size_t src_idx = + data_id + iter_id * (iter_index_stride - k); + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + const std::size_t dst_idx = data_id; + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + } + } + }); + }); + + return write_out_ev; +} + +} // namespace topk_detail + +template +class topk_populate_index_data_krn; + +template +class topk_full_merge_map_back_krn; + +template +sycl::event + topk_full_merge_sort_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + const argTy *arg_tp, + argTy *vals_tp, + IndexTy *inds_tp, + const CompT &comp, + const std::vector &depends) +{ + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * axis_nelems, exec_q); + // extract USM pointer + IndexTy *index_data = index_data_owner.get(); + + using IotaKernelName = topk_populate_index_data_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, index_data, iter_nelems * axis_nelems, depends); + + std::size_t sorted_block_size; + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, axis_nelems, index_data, index_data, comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, + {base_sort_ev}); + + using WriteOutKernelName = topk_full_merge_map_back_krn; + + sycl::event write_out_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems, + axis_nelems, vals_tp, inds_tp, {merges_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, + index_data_owner); + + return cleanup_host_task_event; +}; + +template +class topk_partial_merge_map_back_krn; + +template +class topk_over_work_group_krn; + +template > +sycl::event topk_merge_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. 
number of columns) + std::size_t k, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); + + using dpctl::tensor::kernels::IndexComp; + const IndexComp index_comp{arg_tp, ValueComp{}}; + + if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) { + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, k, + arg_tp, vals_tp, inds_tp, index_comp, + depends); + } + else { + using PartialKernelName = + topk_over_work_group_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy)); + + static constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + std::size_t lws = sub_groups_per_work_group * max_sg_size; + + std::size_t sorted_block_size = elems_per_wi * lws; + if (sorted_block_size > nelems_per_slm) { + const std::vector sg_sizes = + dev.get_info(); + topk_detail::scale_topk_params( + nelems_per_slm, sub_groups_per_work_group, elems_per_wi, + sg_sizes, + lws, // modified by reference + sorted_block_size // modified by reference + ); + } + + // This assumption permits doing away with using a loop + assert(sorted_block_size % lws == 0); + + using search_sorted_detail::quotient_ceil; + const std::size_t n_segments = + quotient_ceil(axis_nelems, sorted_block_size); + + // round k up for the later merge kernel if necessary + const std::size_t round_k_to = dev.has(sycl::aspect::cpu) ? 32 : 4; + std::size_t k_rounded = + (k < round_k_to) + ? k + : quotient_ceil(k, round_k_to) * round_k_to; + + // get length of tail for alloc size + auto rem = axis_nelems % sorted_block_size; + auto alloc_len = (rem && rem < k_rounded) + ? 
rem + k_rounded * (n_segments - 1) + : k_rounded * n_segments; + + // if allocation would be sufficiently large or k is larger than + // elements processed, use full sort + if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size || + alloc_len >= axis_nelems / 2) { + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, + k, arg_tp, vals_tp, inds_tp, + index_comp, depends); + } + + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * alloc_len, exec_q); + // get raw USM pointer + IndexTy *index_data = index_data_owner.get(); + + // no need to populate index data: SLM will be populated with default + // values + + sycl::event base_sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{sorted_block_size}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = + group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = + segment_id * sorted_block_size; + const std::size_t segment_end_idx = std::min( + segment_start_idx + sorted_block_size, axis_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) { + IndexTy v = (array_id < axis_nelems) + ? 
iter_id * axis_nelems + array_id + : IndexTy{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = + quotient_ceil(sorted_block_size, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + merge_sort_detail::leaf_sort_impl( + work_space, chunk_start_idx, chunk_end_idx, index_comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = sycl::min( + 2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = sycl::min( + end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = + chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_sort_detail::merge_impl( + offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + else { + merge_sort_detail::merge_impl( + offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + // output assumed to be structured as (iter_nelems, + // alloc_len) + const std::size_t k_segment_start_idx = + segment_id * k_rounded; + const std::size_t k_segment_end_idx = std::min( + k_segment_start_idx + k_rounded, alloc_len); + const auto &out_src = + (data_in_temp) ? 
scratch_space : work_space; + for (std::size_t array_id = k_segment_start_idx + lid; + array_id < k_segment_end_idx; array_id += lws) { + if (lid < k_rounded) { + index_data[iter_id * alloc_len + array_id] = + out_src[array_id - k_segment_start_idx]; + } + } + }); + }); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, alloc_len, index_data, index_comp, + k_rounded, {base_sort_ev}); + + // Write out top k of the merge-sorted memory + using WriteOutKernelName = + topk_partial_merge_map_back_krn; + + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, alloc_len, + axis_nelems, vals_tp, inds_tp, {merges_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, index_data_owner); + + return cleanup_host_task_event; + } +} + +template +class topk_iota_krn; + +template +class topk_radix_map_back_krn; + +template +sycl::event topk_radix_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool ascending, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); + + const std::size_t total_nelems = iter_nelems * axis_nelems; + const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + padded_total_nelems + total_nelems, exec_q); + + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); + IndexTy *tmp_tp = workspace + padded_total_nelems; + + using IdentityProjT = radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp}; + + using IotaKernelName = topk_iota_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, axis_nelems, workspace, tmp_tp, proj_op, + ascending, {iota_ev}); + + // Write out top k of the temporary + using WriteOutKernelName = topk_radix_map_back_krn; + + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems, + vals_tp, inds_tp, {radix_sort_ev}); + + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, workspace_owner); + + return cleanup_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/where.hpp b/dpnp/tensor/libtensor/include/kernels/where.hpp new file mode 100644 index 000000000000..5527cccec8d2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/where.hpp @@ -0,0 +1,336 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for dpctl.tensor.where. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::search +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class where_strided_kernel; +template +class where_contig_kernel; + +template +class WhereContigFunctor +{ +private: + std::size_t nelems = 0; + const condT *cond_p = nullptr; + const T *x1_p = nullptr; + const T *x2_p = nullptr; + T *dst_p = nullptr; + +public: + WhereContigFunctor(std::size_t nelems_, + const condT *cond_p_, + const T *x1_p_, + const T *x2_p_, + T *dst_p_) + : nelems(nelems_), cond_p(cond_p_), x1_p(x1_p_), x2_p(x2_p_), + dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value || + is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi; + const std::size_t start = + (gid / sgSize) * (nelems_per_sg - sgSize) + gid; + const std::size_t end = 
std::min(nelems, start + nelems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + using dpctl::tensor::type_utils::convert_impl; + const bool check = convert_impl(cond_p[offset]); + dst_p[offset] = check ? x1_p[offset] : x2_p[offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + nelems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t idx = base + it * sgSize; + auto x1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x1_p[idx]); + auto x2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x2_p[idx]); + auto cond_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cond_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + const sycl::vec x1_vec = + sub_group_load(sg, x1_multi_ptr); + const sycl::vec x2_vec = + sub_group_load(sg, x2_multi_ptr); + const sycl::vec cond_vec = + sub_group_load(sg, cond_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems; k += sgSize) { + dst_p[k] = cond_p[k] ? 
x1_p[k] : x2_p[k]; + } + } + } + } +}; + +typedef sycl::event (*where_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + const char *, + const char *, + char *, + const std::vector &); + +template +sycl::event where_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *cond_cp, + const char *x1_cp, + const char *x2_cp, + char *dst_cp, + const std::vector &depends) +{ + const condT *cond_tp = reinterpret_cast(cond_cp); + const T *x1_tp = reinterpret_cast(x1_cp); + const T *x2_tp = reinterpret_cast(x2_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event where_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + std::size_t lws = 64; + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(cond_cp) && + is_aligned(x1_cp) && + is_aligned(x2_cp) && + is_aligned(dst_cp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = where_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + where_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + }); + + return where_ev; +} + +template +class WhereStridedFunctor +{ +private: + const T *x1_p = nullptr; + const T *x2_p = nullptr; + T *dst_p = nullptr; + const condT *cond_p = nullptr; + IndexerT indexer; + +public: + WhereStridedFunctor(const condT *cond_p_, + const T *x1_p_, + const T *x2_p_, + T *dst_p_, + const IndexerT &indexer_) + : x1_p(x1_p_), x2_p(x2_p_), dst_p(dst_p_), cond_p(cond_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + std::size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + + using dpctl::tensor::type_utils::convert_impl; + bool check = + convert_impl(cond_p[offsets.get_first_offset()]); + + dst_p[offsets.get_fourth_offset()] = + check ? 
x1_p[offsets.get_second_offset()] + : x2_p[offsets.get_third_offset()]; + } +}; + +typedef sycl::event (*where_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const char *, + const char *, + const char *, + char *, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event where_strided_impl(sycl::queue &q, + std::size_t nelems, + int nd, + const char *cond_cp, + const char *x1_cp, + const char *x2_cp, + char *dst_cp, + const ssize_t *shape_strides, + ssize_t x1_offset, + ssize_t x2_offset, + ssize_t cond_offset, + ssize_t dst_offset, + const std::vector &depends) +{ + const condT *cond_tp = reinterpret_cast(cond_cp); + const T *x1_tp = reinterpret_cast(x1_cp); + const T *x2_tp = reinterpret_cast(x2_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event where_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const FourOffsets_StridedIndexer indexer{ + nd, cond_offset, x1_offset, x2_offset, dst_offset, shape_strides}; + + cgh.parallel_for< + where_strided_kernel>( + sycl::range<1>(nelems), + WhereStridedFunctor( + cond_tp, x1_tp, x2_tp, dst_tp, indexer)); + }); + + return where_ev; +} + +template +struct WhereStridedFactory +{ + fnT get() + { + fnT fn = where_strided_impl; + return fn; + } +}; + +template +struct WhereContigFactory +{ + fnT get() + { + fnT fn = where_contig_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::search diff --git a/dpnp/tensor/libtensor/include/utils/indexing_utils.hpp b/dpnp/tensor/libtensor/include/utils/indexing_utils.hpp new file mode 100644 index 000000000000..d28c8174c39c --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/indexing_utils.hpp @@ -0,0 +1,153 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
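`WhereContigFactory` and `WhereStridedFactory` above follow the extension's dispatch pattern: a templated `get()` that returns a type-erased function pointer, one per (value, condition) dtype pairing. A hypothetical sketch of pulling one entry out of a factory (the `float`/`bool` pairing and the helper name are chosen purely for illustration):

```cpp
#include "kernels/where.hpp"

// Fetch the contiguous where() implementation for T=float, condT=bool.
inline dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t
select_where_contig_float_bool()
{
    using fnT = dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t;
    using dpctl::tensor::kernels::search::WhereContigFactory;

    return WhereContigFactory<fnT, float, bool>{}.get();
}
```

In the extension proper, a table builder would instantiate this over every supported dtype pair; the resulting pointer is later invoked with type-erased `char *` buffers and a `sycl::queue`.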
+//***************************************************************************** +/// +/// \file +/// This file defines utilities for handling out-of-bounds integer indices in +/// kernels that involve indexing operations, such as take, put, or advanced +/// tensor integer indexing. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::indexing_utils +{ +using dpctl::tensor::ssize_t; + +/* + * ssize_t for indices is a design choice, dpctl::tensor::usm_ndarray + * uses py::ssize_t for shapes and strides internally and Python uses + * py_ssize_t for sizes of e.g. lists. + */ + +template +struct WrapIndex +{ + static_assert(std::is_integral_v); + + ssize_t operator()(ssize_t max_item, IndT ind) const + { + ssize_t projected; + static constexpr ssize_t unit(1); + max_item = sycl::max(max_item, unit); + + static constexpr std::uintmax_t ind_max = + std::numeric_limits::max(); + static constexpr std::uintmax_t ssize_max = + std::numeric_limits::max(); + + if constexpr (std::is_signed_v) { + static constexpr std::intmax_t ind_min = + std::numeric_limits::min(); + static constexpr std::intmax_t ssize_min = + std::numeric_limits::min(); + + if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) { + const ssize_t ind_ = static_cast(ind); + const ssize_t lb = -max_item; + const ssize_t ub = max_item - 1; + projected = sycl::clamp(ind_, lb, ub); + } + else { + const IndT lb = static_cast(-max_item); + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::clamp(ind, lb, ub)); + } + return (projected < 0) ? projected + max_item : projected; + } + else { + if constexpr (ind_max <= ssize_max) { + const ssize_t ind_ = static_cast(ind); + const ssize_t ub = max_item - 1; + projected = sycl::min(ind_, ub); + } + else { + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::min(ind, ub)); + } + return projected; + } + } +}; + +template +struct ClipIndex +{ + static_assert(std::is_integral_v); + + ssize_t operator()(ssize_t max_item, IndT ind) const + { + ssize_t projected; + static constexpr ssize_t unit(1); + max_item = sycl::max(max_item, unit); + + static constexpr std::uintmax_t ind_max = + std::numeric_limits::max(); + static constexpr std::uintmax_t ssize_max = + std::numeric_limits::max(); + if constexpr (std::is_signed_v) { + static constexpr std::intmax_t ind_min = + std::numeric_limits::min(); + static constexpr std::intmax_t ssize_min = + std::numeric_limits::min(); + + if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) { + const ssize_t ind_ = static_cast(ind); + static constexpr ssize_t lb(0); + const ssize_t ub = max_item - 1; + projected = sycl::clamp(ind_, lb, ub); + } + else { + static constexpr IndT lb(0); + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::clamp(ind, lb, ub)); + } + } + else { + if constexpr (ind_max <= ssize_max) { + const ssize_t ind_ = static_cast(ind); + const ssize_t ub = max_item - 1; + projected = sycl::min(ind_, ub); + } + else { + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::min(ind, ub)); + } + } + return projected; + } +}; +} // namespace dpctl::tensor::indexing_utils diff --git a/dpnp/tensor/libtensor/include/utils/math_utils.hpp b/dpnp/tensor/libtensor/include/utils/math_utils.hpp new file mode 100644 index 000000000000..d35eff0074dc --- /dev/null +++ 
b/dpnp/tensor/libtensor/include/utils/math_utils.hpp @@ -0,0 +1,148 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines math utility functions. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace dpctl::tensor::math_utils +{ +template +bool less_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 < imag2) + : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool greater_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 > imag2) + : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool less_equal_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 <= imag2) + : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool greater_equal_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? 
(imag1 >= imag2) + : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +T max_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + bool isnan_imag1 = std::isnan(imag1); + bool gt = (real1 == real2) + ? (imag1 > imag2) + : (real1 > real2 && !isnan_imag1 && !std::isnan(imag2)); + return (std::isnan(real1) || isnan_imag1 || gt) ? x1 : x2; +} + +template +T min_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + bool isnan_imag1 = std::isnan(imag1); + bool lt = (real1 == real2) + ? (imag1 < imag2) + : (real1 < real2 && !isnan_imag1 && !std::isnan(imag2)); + return (std::isnan(real1) || isnan_imag1 || lt) ? x1 : x2; +} + +template +T logaddexp(T x, T y) +{ + if (x == y) { // handle signed infinities + const T log2 = sycl::log(T(2)); + return x + log2; + } + else { + const T tmp = x - y; + static constexpr T zero(0); + + return (tmp > zero) + ? (x + sycl::log1p(sycl::exp(-tmp))) + : ((tmp <= zero) ? y + sycl::log1p(sycl::exp(tmp)) + : std::numeric_limits::quiet_NaN()); + } +} +} // namespace dpctl::tensor::math_utils diff --git a/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp new file mode 100644 index 000000000000..b534e55b3192 --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp @@ -0,0 +1,157 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines utility to determine whether two arrays have memory +/// overlap. 
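The `logaddexp` helper above computes `log(exp(x) + exp(y))` without overflow by only ever exponentiating a non-positive quantity, using the identity `log(e^x + e^y) = max(x, y) + log1p(e^{-|x - y|})`. A standalone restatement in `double` for a quick host-side sanity check (NaN propagation, which the SYCL helper handles in its final branch, is omitted here for brevity):

```cpp
#include <cassert>
#include <cmath>

// Reference restatement of the identity used by math_utils::logaddexp.
double logaddexp_ref(double x, double y)
{
    if (x == y) { // also covers x == y == +/-infinity
        return x + std::log(2.0);
    }
    const double tmp = x - y;
    // exp() only ever sees a non-positive argument, so no overflow
    return (tmp > 0.0) ? x + std::log1p(std::exp(-tmp))
                       : y + std::log1p(std::exp(tmp));
}

int main()
{
    // naive log(exp(1000) + exp(999)) would overflow to +inf
    assert(std::isfinite(logaddexp_ref(1000.0, 999.0)));
    assert(std::abs(logaddexp_ref(0.0, 0.0) - std::log(2.0)) < 1e-12);
    return 0;
}
```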
diff --git a/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp
new file mode 100644
index 000000000000..b534e55b3192
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp
@@ -0,0 +1,157 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines a utility to determine whether two arrays have memory
+/// overlap.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+
+#include <pybind11/pybind11.h>
+
+#include "dpnp4pybind11.hpp"
+
+/* @brief check for overlap of memory regions behind arrays.
+
+Presently assume that array occupies all bytes between smallest and largest
+displaced elements.
+
+TODO: Write proper Frobenius solver to account for holes, e.g.
+  overlap( x_contig[::2], x_contig[1::2]) should give False,
+  while this implementation gives True.
+*/
+namespace dpctl::tensor::overlap
+{
+namespace py = pybind11;
+
+struct MemoryOverlap
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        const char *ar1_data = ar1.get_data();
+
+        const auto &ar1_offsets = ar1.get_minmax_offsets();
+        py::ssize_t ar1_elem_size =
+            static_cast<py::ssize_t>(ar1.get_elemsize());
+
+        const char *ar2_data = ar2.get_data();
+        const auto &ar2_offsets = ar2.get_minmax_offsets();
+        py::ssize_t ar2_elem_size =
+            static_cast<py::ssize_t>(ar2.get_elemsize());
+
+        /* Memory of array1 extends from */
+        /* [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data +
+         *  ar1_offsets.second * ar1_elem_size + ar1_elem_size] */
+        /* Memory of array2 extends from */
+        /* [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data +
+         *  ar2_offsets.second * ar2_elem_size + ar2_elem_size] */
+
+        /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0
+         * <= y1)
+         * && (x1 <= y0 || y1 <= x0) */
+        /* Given that x0 <= x1 and y0 <= y1 are true by construction, the
+         * condition for overlap is (x1 > y0) && (y1 > x0) */
+
+        /* Applying:
+          (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size >
+          ar2_data +
+          ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second *
+          ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first *
+          ar1_elem_size)
+        */
+
+        auto byte_distance = static_cast<py::ssize_t>(ar2_data - ar1_data);
+
+        py::ssize_t x1_minus_y0 =
+            (-byte_distance +
+             (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) -
+              (ar2_offsets.first * ar2_elem_size)));
+
+        py::ssize_t y1_minus_x0 =
+            (byte_distance +
+             (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) -
+              (ar1_offsets.first * ar1_elem_size)));
+
+        bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0);
+
+        return memory_overlap;
+    }
+};
+
+struct SameLogicalTensors
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        // Same ndim
+        int nd1 = ar1.get_ndim();
+        if (nd1 != ar2.get_ndim())
+            return false;
+
+        // Same dtype
+        int tn1 = ar1.get_typenum();
+        if (tn1 != ar2.get_typenum())
+            return false;
+
+        // Same pointer
+        const char *ar1_data = ar1.get_data();
+        const char *ar2_data = ar2.get_data();
+
+        if (ar1_data != ar2_data)
+            return false;
+
+        // Same shape
+        const py::ssize_t *ar1_shape = ar1.get_shape_raw();
+        const py::ssize_t *ar2_shape = ar2.get_shape_raw();
+
+        if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape))
+            return false;
+
+        // Same strides
+        auto const &ar1_strides = ar1.get_strides_vector();
+        auto const &ar2_strides = ar2.get_strides_vector();
+
+        auto ar1_beg_it = std::begin(ar1_strides);
+        auto ar1_end_it = std::end(ar1_strides);
+
+        auto ar2_beg_it = std::begin(ar2_strides);
+
+        if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it))
+            return false;
+
+        // all checks passed: arrays are logical views
+        // into the same memory
+        return true;
+    }
+};
+} // namespace dpctl::tensor::overlap
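Reviewer note, not part of the diff: the overlap test above is the standard interval-intersection check. A worked example with made-up byte ranges:

```cpp
// Two byte ranges [x0, x1) and [y0, y1) overlap iff (x1 > y0) && (y1 > x0).
#include <cstdio>

int main()
{
    long x0 = 0, x1 = 40;  // e.g. 10 float32 elements at byte offset 0
    long y0 = 20, y1 = 40; // e.g. 5 float32 elements at byte offset 20
    std::printf("%d\n", (x1 > y0) && (y1 > x0)); // 1: the ranges intersect
    y0 = 40, y1 = 60;      // shift the second range past the first
    std::printf("%d\n", (x1 > y0) && (y1 > x0)); // 0: disjoint
}
```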
diff --git a/dpnp/tensor/libtensor/include/utils/offset_utils.hpp b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
new file mode 100644
index 000000000000..3a6ac75dfc3a
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
@@ -0,0 +1,788 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines Indexer callable operators to compute an element's
+/// offset in an array addressed by global_id.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "utils/strided_iters.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+
+namespace dpctl::tensor::offset_utils
+{
+namespace detail
+{
+struct sink_t
+{
+    sink_t() {};
+    template <class T>
+    sink_t(T &&) {};
+};
+
+template <class V>
+std::size_t __accumulate_size(std::size_t &s, V &&v)
+{
+    return s += v.size();
+}
+
+template <class V, class U>
+sink_t __appender(V &lhs, U &&rhs)
+{
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+    return {};
+}
+
+template <typename T, typename A, typename... Vs>
+std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
+{
+    std::size_t s = lhs.size();
+    {
+        // limited scope ensures array is freed
+        [[maybe_unused]] sink_t tmp[] = {__accumulate_size(s, vs)..., 0};
+    }
+    lhs.reserve(s);
+    {
+        // array of no-data objects ensures ordering of calls to the appender
+        [[maybe_unused]] sink_t tmp[] = {
+            __appender(lhs, std::forward<Vs>(vs))..., 0};
+    }
+
+    return std::move(lhs); // prevent return-value optimization
+}
+} // namespace detail
+
+template <typename indT, typename... Vs>
+std::tuple<std::unique_ptr<indT, dpctl::tensor::alloc_utils::USMDeleter>,
+           std::size_t,
+           sycl::event>
+    device_allocate_and_pack(sycl::queue &q,
+                             std::vector<sycl::event> &host_task_events,
+                             Vs &&...vs)
+{
+
+    using dpctl::tensor::alloc_utils::usm_host_allocator;
+
+    // memory transfer optimization, use USM-host for temporary speeds up
+    // transfer to device, especially on dGPUs
+    using usm_host_allocatorT = usm_host_allocator<indT>;
+    using shT = std::vector<indT, usm_host_allocatorT>;
+
+    usm_host_allocatorT usm_host_alloc(q);
+    shT empty{0, usm_host_alloc};
+    shT packed_shape_strides = detail::concat(std::move(empty), vs...);
+
+    auto packed_shape_strides_owner =
+        std::make_shared<shT>(std::move(packed_shape_strides));
+
+    auto sz = packed_shape_strides_owner->size();
+    auto shape_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<indT>(sz, q);
+    indT *shape_strides = shape_strides_owner.get();
+
+    sycl::event copy_ev =
+        q.copy<indT>(packed_shape_strides_owner->data(), shape_strides, sz);
+
+    sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(copy_ev);
+        cgh.host_task([packed_shape_strides_owner =
+                           std::move(packed_shape_strides_owner)] {
+            // increment shared pointer ref-count to keep it alive
+            // till copy operation completes;
+        });
+    });
+    host_task_events.push_back(cleanup_host_task_ev);
+
+    return std::make_tuple(std::move(shape_strides_owner), sz, copy_ev);
+}
+
+struct NoOpIndexer
+{
+    constexpr NoOpIndexer() {}
+    constexpr std::size_t operator()(std::size_t gid) const { return gid; }
+};
+
+using dpctl::tensor::ssize_t;
+
+/* @brief Indexer whose equal-sized shape and strides arrays are packed into
+   a single array */
+struct StridedIndexer
+{
+    StridedIndexer(int _nd,
+                   ssize_t _offset,
+                   ssize_t const *_packed_shape_strides)
+        : nd(_nd), starting_offset(_offset),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return compute_offset(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_offset;
+    ssize_t const *shape_strides;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape_strides,      // shape ptr
+            shape_strides + nd, // strides ptr
+            relative_offset);
+        return starting_offset + relative_offset;
+    }
+};
+
+// ensure that indexer is device copyable
+static_assert(sycl::is_device_copyable_v<StridedIndexer>);
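Reviewer note, not part of the diff: a hedged sketch of the intended pattern, packing shape and strides with `device_allocate_and_pack` and addressing a 3x4 C-contiguous view with `StridedIndexer` inside a kernel. The function, queue handling, and include path are illustrative assumptions only.

```cpp
#include <vector>
#include <sycl/sycl.hpp>

#include "utils/offset_utils.hpp" // path assumed per this PR's layout

namespace ou = dpctl::tensor::offset_utils;
using ssizeT = dpctl::tensor::ssize_t;

void example(sycl::queue &q)
{
    const std::vector<ssizeT> shape{3, 4};
    const std::vector<ssizeT> strides{4, 1}; // element strides, C-contiguous

    std::vector<sycl::event> host_task_events;
    // one device allocation holding [shape..., strides...]
    auto [owner, sz, copy_ev] = ou::device_allocate_and_pack<ssizeT>(
        q, host_task_events, shape, strides);
    (void)sz; // sz == 4 here
    const ssizeT *packed = owner.get();

    q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(copy_ev);
         cgh.parallel_for(sycl::range<1>(12), [=](sycl::id<1> id) {
             const ou::StridedIndexer indexer{/*nd*/ 2, /*offset*/ 0, packed};
             [[maybe_unused]] ssizeT off = indexer(static_cast<ssizeT>(id[0]));
         });
     }).wait();
    sycl::event::wait(host_task_events); // keep `owner` alive until done
}
```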
+
+/* @brief Indexer with shape, strides provided separately */
+struct UnpackedStridedIndexer
+{
+    UnpackedStridedIndexer(int _nd,
+                           ssize_t _offset,
+                           ssize_t const *_shape,
+                           ssize_t const *_strides)
+        : nd(_nd), starting_offset(_offset), shape(_shape), strides(_strides)
+    {
+    }
+
+    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return compute_offset(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_offset;
+    ssize_t const *shape;
+    ssize_t const *strides;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape,   // shape ptr
+            strides, // strides ptr
+            relative_offset);
+        return starting_offset + relative_offset;
+    }
+};
+
+// ensure that indexer is device copyable
+static_assert(sycl::is_device_copyable_v<UnpackedStridedIndexer>);
+
+struct Strided1DIndexer
+{
+    Strided1DIndexer(std::size_t _size) : offset{}, size(_size), step(1) {}
+    Strided1DIndexer(ssize_t _size)
+        : offset{}, size(static_cast<std::size_t>(_size)), step(1)
+    {
+    }
+    Strided1DIndexer(std::size_t _size, ssize_t _step)
+        : offset{}, size(_size), step(_step)
+    {
+    }
+    Strided1DIndexer(std::size_t _size, std::size_t _step)
+        : offset{}, size(_size), step(static_cast<ssize_t>(_step))
+    {
+    }
+    Strided1DIndexer(ssize_t _size, ssize_t _step)
+        : offset{}, size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, std::size_t _size, ssize_t _step)
+        : offset(_offset), size(_size), step(_step)
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, std::size_t _size, std::size_t _step)
+        : offset(_offset), size(_size), step(static_cast<ssize_t>(_step))
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
+        : offset(_offset), size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        // ensure 0 <= gid < size
+        return offset + std::min(gid, size - 1) * step;
+    }
+
+private:
+    ssize_t offset = 0;
+    std::size_t size = 1;
+    ssize_t step = 1;
+};
+
+static_assert(sycl::is_device_copyable_v<Strided1DIndexer>);
+
+struct Strided1DCyclicIndexer
+{
+    Strided1DCyclicIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
+        : offset(_offset), size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return offset + (gid % size) * step;
+    }
+
+private:
+    ssize_t offset = 0;
+    std::size_t size = 1;
+    ssize_t step = 1;
+};
+
+static_assert(sycl::is_device_copyable_v<Strided1DCyclicIndexer>);
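Reviewer note, not part of the diff: the difference between the two 1D indexers in concrete numbers (host-side, illustrative only; include path assumed from this PR):

```cpp
#include <iostream>

#include "utils/offset_utils.hpp"

int main()
{
    using ssizeT = dpctl::tensor::ssize_t;
    using namespace dpctl::tensor::offset_utils;

    // offset 0, 4 elements, step 2: valid offsets are 0, 2, 4, 6
    Strided1DIndexer clamped{ssizeT(0), ssizeT(4), ssizeT(2)};
    std::cout << clamped(3) << '\n';  // 6
    std::cout << clamped(10) << '\n'; // 6: out-of-range gid clamps to the last element
    Strided1DCyclicIndexer wrapped{ssizeT(0), ssizeT(4), ssizeT(2)};
    std::cout << wrapped(5) << '\n';  // 2: (5 % 4) * 2, wraps around instead
}
```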
+
+template <typename displacementT>
+struct TwoOffsets
+{
+    constexpr TwoOffsets() : first_offset(0), second_offset(0) {}
+    constexpr TwoOffsets(const displacementT &first_offset_,
+                         const displacementT &second_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+};
+
+struct TwoOffsets_StridedIndexer
+{
+    TwoOffsets_StridedIndexer(int common_nd,
+                              ssize_t first_offset_,
+                              ssize_t second_offset_,
+                              ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    TwoOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    TwoOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t const *shape_strides;
+
+    TwoOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            relative_first_offset, relative_second_offset);
+        return TwoOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset);
+    }
+};
+
+struct TwoZeroOffsets_Indexer
+{
+    constexpr TwoZeroOffsets_Indexer() {}
+
+    constexpr TwoOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return TwoOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<TwoZeroOffsets_Indexer>);
+
+template <typename FirstIndexerT, typename SecondIndexerT>
+struct TwoOffsets_CombinedIndexer
+{
+private:
+    FirstIndexerT first_indexer_;
+    SecondIndexerT second_indexer_;
+
+public:
+    constexpr TwoOffsets_CombinedIndexer(const FirstIndexerT &first_indexer,
+                                         const SecondIndexerT &second_indexer)
+        : first_indexer_(first_indexer), second_indexer_(second_indexer)
+    {
+    }
+
+    constexpr TwoOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return TwoOffsets<ssize_t>(first_indexer_(gid), second_indexer_(gid));
+    }
+};
+
+template <typename displacementT>
+struct ThreeOffsets
+{
+    constexpr ThreeOffsets()
+        : first_offset(0), second_offset(0), third_offset(0)
+    {
+    }
+    constexpr ThreeOffsets(const displacementT &first_offset_,
+                           const displacementT &second_offset_,
+                           const displacementT &third_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_),
+          third_offset(third_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+    constexpr displacementT get_third_offset() const { return third_offset; }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+    displacementT third_offset = 0;
+};
+
+struct ThreeOffsets_StridedIndexer
+{
+    ThreeOffsets_StridedIndexer(int common_nd,
+                                ssize_t first_offset_,
+                                ssize_t second_offset_,
+                                ssize_t third_offset_,
+                                ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          starting_third_offset(third_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    ThreeOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    ThreeOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t starting_third_offset;
+    ssize_t const *shape_strides;
+
+    ThreeOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        ssize_t relative_third_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            shape_strides + 3 * nd, // strides ptr
+            relative_first_offset, relative_second_offset,
+            relative_third_offset);
+        return ThreeOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset,
+            starting_third_offset + relative_third_offset);
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<ThreeOffsets_StridedIndexer>);
+
+struct ThreeZeroOffsets_Indexer
+{
+    constexpr ThreeZeroOffsets_Indexer() {}
+
+    constexpr ThreeOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return ThreeOffsets<ssize_t>();
+    }
+
+    constexpr ThreeOffsets<ssize_t> operator()(std::size_t) const
+    {
+        return ThreeOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<ThreeZeroOffsets_Indexer>);
+
+template <typename FirstIndexerT,
+          typename SecondIndexerT,
+          typename ThirdIndexerT>
+struct ThreeOffsets_CombinedIndexer
+{
+private:
+    FirstIndexerT first_indexer_;
+    SecondIndexerT second_indexer_;
+    ThirdIndexerT third_indexer_;
+
+public:
+    constexpr ThreeOffsets_CombinedIndexer(const FirstIndexerT &first_indexer,
+                                           const SecondIndexerT &second_indexer,
+                                           const ThirdIndexerT &third_indexer)
+        : first_indexer_(first_indexer), second_indexer_(second_indexer),
+          third_indexer_(third_indexer)
+    {
+    }
+
+    constexpr ThreeOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return ThreeOffsets<ssize_t>(first_indexer_(gid), second_indexer_(gid),
+                                     third_indexer_(gid));
+    }
+};
+
+template <typename displacementT>
+struct FourOffsets
+{
+    constexpr FourOffsets()
+        : first_offset(0), second_offset(0), third_offset(0), fourth_offset(0)
+    {
+    }
+    constexpr FourOffsets(const displacementT &first_offset_,
+                          const displacementT &second_offset_,
+                          const displacementT &third_offset_,
+                          const displacementT &fourth_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_),
+          third_offset(third_offset_), fourth_offset(fourth_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+    constexpr displacementT get_third_offset() const { return third_offset; }
+    constexpr displacementT get_fourth_offset() const { return fourth_offset; }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+    displacementT third_offset = 0;
+    displacementT fourth_offset = 0;
+};
+
+struct FourOffsets_StridedIndexer
+{
+    constexpr FourOffsets_StridedIndexer(int common_nd,
+                                         ssize_t first_offset_,
+                                         ssize_t second_offset_,
+                                         ssize_t third_offset_,
+                                         ssize_t fourth_offset_,
+                                         ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          starting_third_offset(third_offset_),
+          starting_fourth_offset(fourth_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    constexpr FourOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    constexpr FourOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t starting_third_offset;
+    ssize_t starting_fourth_offset;
+    ssize_t const *shape_strides;
+
+    FourOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        ssize_t relative_third_offset(0);
+        ssize_t relative_fourth_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            shape_strides + 3 * nd, // strides ptr
+            shape_strides + 4 * nd, // strides ptr
+            relative_first_offset, relative_second_offset,
+            relative_third_offset, relative_fourth_offset);
+        return FourOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset,
+            starting_third_offset + relative_third_offset,
+            starting_fourth_offset + relative_fourth_offset);
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<FourOffsets_StridedIndexer>);
+
+struct FourZeroOffsets_Indexer
+{
+    constexpr FourZeroOffsets_Indexer() {}
+
+    constexpr FourOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return FourOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<FourZeroOffsets_Indexer>);
+
+struct NthStrideOffset
+{
+    NthStrideOffset(int common_nd,
+                    ssize_t const *_offsets,
+                    ssize_t const *_packed_shape_strides)
+        : _ind(common_nd), nd(common_nd), offsets(_offsets),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    std::size_t operator()(ssize_t gid, int n) const
+    {
+        ssize_t relative_offset(0);
+        _ind.get_displacement(
+            gid, shape_strides, shape_strides + ((n + 1) * nd),
+            relative_offset);
+
+        return relative_offset + offsets[n];
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_vector<ssize_t> _ind;
+
+    int nd;
+    ssize_t const *offsets;
+    ssize_t const *shape_strides;
+};
+
+static_assert(sycl::is_device_copyable_v<NthStrideOffset>);
+
+template <int nd>
+struct FixedDimStridedIndexer
+{
+    FixedDimStridedIndexer(const std::array<ssize_t, nd> &_shape,
+                           const std::array<ssize_t, nd> &_strides,
+                           ssize_t _offset)
+        : _ind(_shape), strides(_strides), starting_offset(_offset)
+    {
+    }
+    std::size_t operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset = 0;
+
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset += mi[i] * strides[i];
+        }
+        return starting_offset + relative_offset;
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides;
+    ssize_t starting_offset;
+};
+
+static_assert(sycl::is_device_copyable_v<FixedDimStridedIndexer<1>>);
+
+template <int nd>
+struct TwoOffsets_FixedDimStridedIndexer
+{
+    TwoOffsets_FixedDimStridedIndexer(const std::array<ssize_t, nd> &_shape,
+                                      const std::array<ssize_t, nd> &_strides1,
+                                      const std::array<ssize_t, nd> &_strides2,
+                                      ssize_t _offset1,
+                                      ssize_t _offset2)
+        : _ind(_shape), strides1(_strides1), strides2(_strides2),
+          starting_offset1(_offset1), starting_offset2(_offset2)
+    {
+    }
+
+    TwoOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset1 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset1 += mi[i] * strides1[i];
+        }
+
+        ssize_t relative_offset2 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset2 += mi[i] * strides2[i];
+        }
+
+        return TwoOffsets<ssize_t>(starting_offset1 + relative_offset1,
+                                   starting_offset2 + relative_offset2);
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides1;
+    std::array<ssize_t, nd> strides2;
+    ssize_t starting_offset1;
+    ssize_t starting_offset2;
+};
+
+static_assert(
+    sycl::is_device_copyable_v<TwoOffsets_FixedDimStridedIndexer<1>>);
+
+template <int nd>
+struct ThreeOffsets_FixedDimStridedIndexer
+{
+    ThreeOffsets_FixedDimStridedIndexer(
+        const std::array<ssize_t, nd> &_shape,
+        const std::array<ssize_t, nd> &_strides1,
+        const std::array<ssize_t, nd> &_strides2,
+        const std::array<ssize_t, nd> &_strides3,
+        ssize_t _offset1,
+        ssize_t _offset2,
+        ssize_t _offset3)
+        : _ind(_shape), strides1(_strides1), strides2(_strides2),
+          strides3(_strides3), starting_offset1(_offset1),
+          starting_offset2(_offset2), starting_offset3(_offset3)
+    {
+    }
+
+    ThreeOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset1 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset1 += mi[i] * strides1[i];
+        }
+
+        ssize_t relative_offset2 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset2 += mi[i] * strides2[i];
+        }
+
+        ssize_t relative_offset3 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset3 += mi[i] * strides3[i];
+        }
+
+        return ThreeOffsets<ssize_t>(starting_offset1 + relative_offset1,
+                                     starting_offset2 + relative_offset2,
+                                     starting_offset3 + relative_offset3);
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides1;
+    std::array<ssize_t, nd> strides2;
+    std::array<ssize_t, nd> strides3;
+    ssize_t starting_offset1;
+    ssize_t starting_offset2;
+    ssize_t starting_offset3;
+};
+
+static_assert(
+    sycl::is_device_copyable_v<ThreeOffsets_FixedDimStridedIndexer<1>>);
+} // namespace dpctl::tensor::offset_utils
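Reviewer note, not part of the diff: the `*_CombinedIndexer` templates compose the simple indexers above. A hedged sketch for a binary-op style kernel whose input is strided and whose output is contiguous; names are illustrative:

```cpp
#include "utils/offset_utils.hpp" // path assumed per this PR's layout

namespace ou = dpctl::tensor::offset_utils;
using ssizeT = dpctl::tensor::ssize_t;

// first offset: strided input; second offset: gid itself (contiguous output)
using InOutIndexerT =
    ou::TwoOffsets_CombinedIndexer<ou::StridedIndexer, ou::NoOpIndexer>;

ou::TwoOffsets<ssizeT> offsets_for(int nd, const ssizeT *packed, ssizeT gid)
{
    const InOutIndexerT indexer{ou::StridedIndexer{nd, 0, packed},
                                ou::NoOpIndexer{}};
    return indexer(gid);
}
```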
diff --git a/dpnp/tensor/libtensor/include/utils/output_validation.hpp b/dpnp/tensor/libtensor/include/utils/output_validation.hpp
new file mode 100644
index 000000000000..26f1b29bd3d8
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/output_validation.hpp
@@ -0,0 +1,79 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utilities for determining if an array is a valid output
+/// array.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+
+#include <pybind11/pybind11.h>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::validation
+{
+namespace py = pybind11;
+
+/*! @brief Raises a value error if an array is read-only.
+
+    This should be called with an array before writing.*/
+struct CheckWritable
+{
+    static void throw_if_not_writable(const dpctl::tensor::usm_ndarray &arr)
+    {
+        if (!arr.is_writable()) {
+            throw py::value_error("output array is read-only.");
+        }
+        return;
+    }
+};
+
+/*! @brief Raises a value error if an array's memory is not sufficiently ample
+    to accommodate an input number of elements.
+
+    This should be called with an array before writing.*/
+struct AmpleMemory
+{
+    template <typename T>
+    static void throw_if_not_ample(const dpctl::tensor::usm_ndarray &arr,
+                                   T nelems)
+    {
+        auto arr_offsets = arr.get_minmax_offsets();
+        T range = static_cast<T>(arr_offsets.second - arr_offsets.first);
+        if (range + 1 < nelems) {
+            throw py::value_error("Memory addressed by the output array is not "
+                                  "sufficiently ample.");
+        }
+        return;
+    }
+};
+} // namespace dpctl::tensor::validation
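Reviewer note, not part of the diff: a hedged sketch of the intended use at the top of a Python-exposed binding that writes `nelems` elements into `dst`; the wrapper name is an assumption:

```cpp
#include "utils/output_validation.hpp" // path assumed per this PR's layout

void validate_output(const dpctl::tensor::usm_ndarray &dst, std::size_t nelems)
{
    using namespace dpctl::tensor::validation;
    CheckWritable::throw_if_not_writable(dst);
    AmpleMemory::throw_if_not_ample(dst, nelems);
}
```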
diff --git a/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
new file mode 100644
index 000000000000..5d03294392d8
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
@@ -0,0 +1,149 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines comparators for sorting that impose a total order on
+/// floating-point and complex values, including NaNs.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cmath>
+#include <complex>
+#include <functional>
+#include <type_traits>
+
+#include "sycl/sycl.hpp"
+
+namespace dpctl::tensor::rich_comparisons
+{
+
+namespace detail
+{
+template <typename fpT>
+struct ExtendedRealFPLess
+{
+    /* [R, nan] */
+    bool operator()(const fpT v1, const fpT v2) const
+    {
+        return (!std::isnan(v1) && (std::isnan(v2) || (v1 < v2)));
+    }
+};
+
+template <typename fpT>
+struct ExtendedRealFPGreater
+{
+    bool operator()(const fpT v1, const fpT v2) const
+    {
+        return (!std::isnan(v2) && (std::isnan(v1) || (v2 < v1)));
+    }
+};
+
+template <typename cT>
+struct ExtendedComplexFPLess
+{
+    /* [(R, R), (R, nan), (nan, R), (nan, nan)] */
+
+    bool operator()(const cT &v1, const cT &v2) const
+    {
+        using realT = typename cT::value_type;
+
+        const realT real1 = std::real(v1);
+        const realT real2 = std::real(v2);
+
+        const bool r1_nan = std::isnan(real1);
+        const bool r2_nan = std::isnan(real2);
+
+        const realT imag1 = std::imag(v1);
+        const realT imag2 = std::imag(v2);
+
+        const bool i1_nan = std::isnan(imag1);
+        const bool i2_nan = std::isnan(imag2);
+
+        const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0);
+        const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 1 : 0);
+
+        const bool res =
+            !(r1_nan && i1_nan) &&
+            ((idx1 < idx2) ||
+             ((idx1 == idx2) &&
+              ((r1_nan && !i1_nan && (imag1 < imag2)) ||
+               (!r1_nan && i1_nan && (real1 < real2)) ||
+               (!r1_nan && !i1_nan &&
+                ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2)))))));
+
+        return res;
+    }
+};
+
+template <typename cT>
+struct ExtendedComplexFPGreater
+{
+    bool operator()(const cT &v1, const cT &v2) const
+    {
+        auto less_ = ExtendedComplexFPLess<cT>{};
+        return less_(v2, v1);
+    }
+};
+
+template <typename T>
+inline constexpr bool is_fp_v =
+    (std::is_same_v<T, sycl::half> || std::is_same_v<T, float> ||
+     std::is_same_v<T, double>);
+
+} // namespace detail
+
+template <typename argTy>
+struct AscendingSorter
+{
+    using type = std::conditional_t<detail::is_fp_v<argTy>,
+                                    detail::ExtendedRealFPLess<argTy>,
+                                    std::less<argTy>>;
+};
+
+template <typename T>
+struct AscendingSorter<std::complex<T>>
+{
+    using type = detail::ExtendedComplexFPLess<std::complex<T>>;
+};
+
+template <typename argTy>
+struct DescendingSorter
+{
+    using type = std::conditional_t<detail::is_fp_v<argTy>,
+                                    detail::ExtendedRealFPGreater<argTy>,
+                                    std::greater<argTy>>;
+};
+
+template <typename T>
+struct DescendingSorter<std::complex<T>>
+{
+    using type = detail::ExtendedComplexFPGreater<std::complex<T>>;
+};
+
+} // namespace dpctl::tensor::rich_comparisons
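Reviewer note, not part of the diff: a hedged host-side illustration of the sorter selection. `AscendingSorter<double>::type` resolves to the NaN-aware comparator, so NaNs sort after all finite values:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

#include "utils/rich_comparisons.hpp" // path assumed per this PR's layout

int main()
{
    using SorterT =
        dpctl::tensor::rich_comparisons::AscendingSorter<double>::type;
    std::vector<double> v{2.0, std::nan(""), -1.0};
    std::sort(v.begin(), v.end(), SorterT{});
    // v == {-1.0, 2.0, nan}: NaNs order after all numbers
}
```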
diff --git a/dpnp/tensor/libtensor/include/utils/strided_iters.hpp b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
new file mode 100644
index 000000000000..65250b755b56
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
@@ -0,0 +1,984 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines CIndexer_array, and CIndexer_vector classes, as well
+/// iteration space simplifiers.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <numeric>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+namespace dpctl::tensor::strides
+{
+/* An N-dimensional array can be stored in a single
+ * contiguous chunk of memory by contiguously laying
+ * array elements in lexicographic order of their
+ * array indices. Such a layout is called C-contiguous.
+ *
+ * E.g. for (2, 3, 2) array `a` with zero-based indexing convention
+ * the C-array's elements are
+ *   { a[0,0,0], a[0,0,1], a[0,1,0], a[0,1,1], a[0,2,0], a[0,2,1],
+ *     a[1,0,0], a[1,0,1], a[1,1,0], a[1,1,1], a[1,2,0], a[1,2,1] }
+ *
+ * Indexer maps zero-based index in C-array to a multi-index
+ * for the purpose of computing element displacement in the
+ * strided array, i.e. in the above example for k = 5, the displacement
+ * is (s0*0 + s1*2 + s2*1), and for k = 7 it is (s0*1 + s1*0 + s2*1)
+ * for N-dimensional array with strides (s0, s1, s2).
+ *
+ * CIndexer_vector need not know the array rank `dim` at compile time.
+ * Shape and strides are stored in std::vector, which is not trivially
+ * copyable.
+ *
+ * For the class to remain trivially copyable for offloading, the
+ * displacement computation methods take accessor/pointer arguments
+ * for shape and strides, and modify a displacement argument passed
+ * by reference.
+ */
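Reviewer note, not part of the diff: a quick numeric check of the worked example in the comment above, using the same quotient/remainder recurrence as `CIndexer_vector::get_displacement`:

```cpp
#include <cassert>

int main()
{
    const long shape[3] = {2, 3, 2};
    const long strides[3] = {6, 2, 1}; // C-contiguous, in elements
    long k = 5, disp = 0;              // k = 5 -> multi-index (0, 2, 1)
    for (int dim = 3; --dim > 0;) {
        const long q = k / shape[dim];
        disp += (k - q * shape[dim]) * strides[dim];
        k = q;
    }
    disp += k * strides[0];
    assert(disp == 6 * 0 + 2 * 2 + 1 * 1); // s0*0 + s1*2 + s2*1 = 5
}
```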
+template <typename indT = std::ptrdiff_t>
+class CIndexer_vector
+{
+    static_assert(std::is_integral<indT>::value, "Integral type is required");
+    static_assert(std::is_signed<indT>::value,
+                  "Signed integral type is required");
+    int nd;
+
+public:
+    CIndexer_vector(int dim) : nd(dim) {}
+
+    template <class ShapeTy>
+    indT size(const ShapeTy &shape) const
+    {
+        indT s = static_cast<indT>(1);
+        for (int i = 0; i < nd; ++i) {
+            s *= shape[i];
+        }
+        return s;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride,
+                          indT &disp) const
+    {
+        if (nd == 1) {
+            disp = i * stride[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            d += r * stride[dim];
+            i_ = q;
+        }
+        disp = d + i_ * stride[0];
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          indT &disp1,
+                          indT &disp2) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+        }
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          const StridesTy &stride3,
+                          indT &disp1,
+                          indT &disp2,
+                          indT &disp3) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            disp3 = i * stride3[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0, d3 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+            d3 += r * stride3[dim];
+        };
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        disp3 = d3 + i_ * stride3[0];
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          const StridesTy &stride3,
+                          const StridesTy &stride4,
+                          indT &disp1,
+                          indT &disp2,
+                          indT &disp3,
+                          indT &disp4) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            disp3 = i * stride3[0];
+            disp4 = i * stride4[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0, d3 = 0, d4 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+            d3 += r * stride3[dim];
+            d4 += r * stride4[dim];
+        }
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        disp3 = d3 + i_ * stride3[0];
+        disp4 = d4 + i_ * stride4[0];
+        return;
+    }
+
+    template <int nstrides, class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const std::array<StridesTy, nstrides> &strides,
+                          std::array<indT, nstrides> &disps) const
+    {
+        if (nd == 1) {
+            for (int k = 0; k < nstrides; ++k) {
+                disps[k] = i * strides[k][0];
+            }
+            return;
+        }
+
+        indT i_ = i;
+        std::array<indT, nstrides> ds;
+        for (int k = 0; k < nstrides; ++k) {
+            ds[k] = 0;
+        }
+
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            for (int k = 0; k < nstrides; ++k) {
+                ds[k] += r * strides[k][dim];
+            }
+            i_ = q;
+        };
+        for (int k = 0; k < nstrides; ++k) {
+            disps[k] = ds[k] + i_ * strides[k][0];
+        }
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_left_rolled_displacement(const indT i,
+                                      const ShapeTy &shape,
+                                      const StridesTy &stride,
+                                      const StridesTy &shifts,
+                                      indT &disp) const
+    {
+        indT i_ = i;
+        indT d(0);
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            // assumes si > shifts[dim] >= 0
+            const indT shifted_r =
+                (r < shifts[dim] ? r + si - shifts[dim] : r - shifts[dim]);
+            d += shifted_r * stride[dim];
+            i_ = q;
+        }
+        const indT shifted_r =
+            (i_ < shifts[0] ? i_ + shape[0] - shifts[0] : i_ - shifts[0]);
+        disp = d + shifted_r * stride[0];
+    }
+};
+
+/*
+ * CIndexer is for arrays whose array-rank is known at compile time.
+ * Statically allocated shape and multi_index arrays are members of
+ * the class instance, and it remains trivially copyable.
+ *
+ * Method `set(k)` populates work-item private array multi_index, which
+ * can be accessed using `get()` to compute the displacement as needed.
+ */
+
+template <int _ndim, typename indT = std::ptrdiff_t>
+class CIndexer_array
+{
+    static constexpr int ndim = _ndim;
+
+    static_assert(std::is_integral<indT>::value, "Integral type is required");
+    static_assert(std::is_signed<indT>::value,
+                  "Signed integral type is required");
+    static_assert(ndim > 0, "Dimensionality must be positive");
+
+private:
+    typedef std::array<indT, ndim> index_t;
+
+    indT elem_count;
+    index_t shape;
+    index_t multi_index;
+
+public:
+    CIndexer_array() : elem_count(0), shape{}, multi_index{} {}
+
+    explicit CIndexer_array(const index_t &input_shape)
+        : elem_count(0), shape{}, multi_index{}
+    {
+        indT s(1);
+        for (int i = 0; i < ndim; ++i) {
+            shape[i] = input_shape[i];
+            s *= input_shape[i];
+        }
+        elem_count = s;
+    }
+
+    indT size() const { return elem_count; }
+    indT rank() const { return ndim; }
+
+    void set(const indT i)
+    {
+        if (ndim == 1) {
+            multi_index[0] = i;
+            return;
+        }
+
+        indT i_ = i;
+#pragma unroll
+        for (int dim = ndim; --dim > 0;) {
+            indT si = shape[dim];
+            indT q = i_ / si;
+            multi_index[dim] = i_ - q * si;
+            i_ = q;
+        }
+        multi_index[0] = i_;
+    }
+
+    const index_t &get() const { return multi_index; }
+};
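Reviewer note, not part of the diff: a hedged mini-example of `CIndexer_array` for a fixed-rank (2, 3, 2) array; `set(k)` recovers the multi-index of the k-th element in C order:

```cpp
#include <cassert>

#include "utils/strided_iters.hpp" // path assumed per this PR's layout

int main()
{
    using dpctl::tensor::strides::CIndexer_array;
    CIndexer_array<3, long> ix({2, 3, 2});
    assert(ix.size() == 12);
    ix.set(5);
    const auto &mi = ix.get();
    assert(mi[0] == 0 && mi[1] == 2 && mi[2] == 1);
}
```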
+
+/*
+    For purposes of iterating over elements of array with
+    `shape` and `strides` given as pointers
+    `simplify_iteration_stride(nd, shape_ptr, strides_ptr, disp)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides, disp)` are such that iterating over
+    them will traverse the same elements, possibly in
+    different order.
+
+    ..Example: python
+        import itertools
+        # for some array Y over whose elements we iterate
+        csh, cst, cp = contract_iter(Y.shape, Y.strides)
+        def pointers_set(sh, st, p):
+            citers = itertools.product(*map(lambda s: range(s), sh))
+            dot = lambda st, it: sum(st[k]*it[k] for k in range(len(st)))
+            return set(p + dot(st, it) for it in citers)
+        ps1 = pointers_set(csh, cst, cp)
+        ps2 = pointers_set(Y.shape, Y.strides, 0)
+        assert ps1 == ps2
+
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_stride(const int nd,
+                              ShapeTy *shape,
+                              StridesTy *strides,
+                              StridesTy &disp)
+{
+    disp = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(), [&strides, &shape](int i1, int i2) {
+            auto abs_str1 = (strides[i1] < 0) ? -strides[i1] : strides[i1];
+            auto abs_str2 = (strides[i2] < 0) ? -strides[i2] : strides[i2];
+            return (abs_str1 > abs_str2) ||
+                   (abs_str1 == abs_str2 && shape[i1] > shape[i2]);
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides_w;
+    int nd_ = nd;
+    shape_w.reserve(nd_);
+    strides_w.reserve(nd_);
+
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str_p = strides[p];
+        shape_w.push_back(sh_p);
+        if (str_p < 0) {
+            disp += str_p * (sh_p - 1);
+            str_p = -str_p;
+        }
+        strides_w.push_back(str_p);
+    }
+
+    {
+        bool changed;
+        do {
+            changed = false;
+            for (int i = 0; i + 1 < nd_; ++i) {
+                StridesTy step = strides_w[i + 1];
+                StridesTy jump = strides_w[i] - (shape_w[i + 1] - 1) * step;
+                if (jump == step) {
+                    changed = true;
+                    for (int k = i; k + 1 < nd_; ++k) {
+                        strides_w[k] = strides_w[k + 1];
+                    }
+                    shape_w[i] *= shape_w[i + 1];
+                    for (int k = i + 1; k + 1 < nd_; ++k) {
+                        shape_w[k] = shape_w[k + 1];
+                    }
+                    --nd_;
+                }
+            }
+        } while (changed);
+    }
+
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides[i] = strides_w[i];
+    }
+
+    return nd_;
+}
+
+/*
+    For purposes of iterating over pairs of elements of two arrays
+    with `shape` and strides `strides1`, `strides2` given as pointers
+    `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr,
+    strides2_ptr, disp1, disp2)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that
+    iterating over them will traverse the same set of pairs of elements,
+    possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_two_strides(const int nd,
+                                   ShapeTy *shape,
+                                   StridesTy *strides1,
+                                   StridesTy *strides2,
+                                   StridesTy &disp1,
+                                   StridesTy &disp2)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(), [&strides1, &strides2, &shape](int i1, int i2) {
+            auto abs_str1_i1 =
+                (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+            auto abs_str1_i2 =
+                (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+            auto abs_str2_i1 =
+                (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+            auto abs_str2_i2 =
+                (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+            return (abs_str2_i1 > abs_str2_i2) ||
+                   (abs_str2_i1 == abs_str2_i2 &&
+                    (abs_str1_i1 > abs_str1_i2 ||
+                     (abs_str1_i1 == abs_str1_i2 && shape[i1] > shape[i2])));
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && std::min(str1_p, str2_p) < 0) {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+        }
+        if (str1_p < 0 || str2_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+    }
+
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+
+            if (jump1 == str1 && jump2 == str2) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+
+    return nd_;
+}
+
+template <class T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T> contract_iter(const vecT &shape, const vecT &strides)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides = strides;
+    T disp(0);
+
+    int nd = simplify_iteration_stride(dim, out_shape.data(),
+                                       out_strides.data(), disp);
+    out_shape.resize(nd);
+    out_strides.resize(nd);
+    return std::make_tuple(out_shape, out_strides, disp);
+}
+
+template <class T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T, vecT, T> contract_iter2(const vecT &shape,
+                                                  const vecT &strides1,
+                                                  const vecT &strides2)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides1.size() || dim != strides2.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides1 = strides1;
+    vecT out_strides2 = strides2;
+    T disp1(0);
+    T disp2(0);
+
+    int nd = simplify_iteration_two_strides(dim, out_shape.data(),
+                                            out_strides1.data(),
+                                            out_strides2.data(), disp1, disp2);
+    out_shape.resize(nd);
+    out_strides1.resize(nd);
+    out_strides2.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2);
+}
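Reviewer note, not part of the diff: a hedged illustration of `contract_iter` collapsing the iteration space of a C-contiguous 2D view to a single dimension:

```cpp
#include <cassert>
#include <stdexcept>
#include <vector>

#include "utils/strided_iters.hpp" // path assumed per this PR's layout

int main()
{
    using dpctl::tensor::strides::contract_iter;
    std::vector<long> shape{3, 4};
    std::vector<long> strides{4, 1}; // contiguous rows
    auto [sh, st, disp] =
        contract_iter<long, std::runtime_error>(shape, strides);
    assert(sh == std::vector<long>({12}) && st == std::vector<long>({1}));
    assert(disp == 0);
}
```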
+
+/*
+    For purposes of iterating over pairs of elements of three arrays
+    with `shape` and strides `strides1`, `strides2`, `strides3` given as
+    pointers `simplify_iteration_three_strides(nd, shape_ptr, strides1_ptr,
+    strides2_ptr, strides3_ptr, disp1, disp2, disp3)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3)`
+    are such that iterating over them will traverse the same set of tuples of
+    elements, possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_three_strides(const int nd,
+                                     ShapeTy *shape,
+                                     StridesTy *strides1,
+                                     StridesTy *strides2,
+                                     StridesTy *strides3,
+                                     StridesTy &disp1,
+                                     StridesTy &disp2,
+                                     StridesTy &disp3)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    disp3 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(pos.begin(), pos.end(),
+                     [&strides1, &strides2, &strides3, &shape](int i1, int i2) {
+                         auto abs_str1_i1 =
+                             (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+                         auto abs_str1_i2 =
+                             (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+                         auto abs_str2_i1 =
+                             (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+                         auto abs_str2_i2 =
+                             (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+                         auto abs_str3_i1 =
+                             (strides3[i1] < 0) ? -strides3[i1] : strides3[i1];
+                         auto abs_str3_i2 =
+                             (strides3[i2] < 0) ? -strides3[i2] : strides3[i2];
+                         return (abs_str3_i1 > abs_str3_i2) ||
+                                ((abs_str3_i1 == abs_str3_i2) &&
+                                 ((abs_str2_i1 > abs_str2_i2) ||
+                                  ((abs_str2_i1 == abs_str2_i2) &&
+                                   ((abs_str1_i1 > abs_str1_i2) ||
+                                    ((abs_str1_i1 == abs_str1_i2) &&
+                                     (shape[i1] > shape[i2]))))));
+                     });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+    std::vector<StridesTy> strides3_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        auto str3_p = strides3[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 &&
+            std::min({str1_p, str2_p, str3_p}) < 0) {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+            disp3 += str3_p * (sh_p - 1);
+            str3_p = -str3_p;
+        }
+        if (str1_p < 0 || str2_p < 0 || str3_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+        strides3_w.push_back(str3_p);
+    }
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy str3 = strides3_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+            StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3;
+
+            if (jump1 == str1 && jump2 == str2 && jump3 == str3) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides3_w[j] = strides3_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides3[i] = strides3_w[i];
+    }
+
+    return nd_;
+}
+
+template <class T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T, vecT, T, vecT, T>
+    contract_iter3(const vecT &shape,
+                   const vecT &strides1,
+                   const vecT &strides2,
+                   const vecT &strides3)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides1.size() || dim != strides2.size() ||
+        dim != strides3.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides1 = strides1;
+    vecT out_strides2 = strides2;
+    vecT out_strides3 = strides3;
+    T disp1(0);
+    T disp2(0);
+    T disp3(0);
+
+    int nd = simplify_iteration_three_strides(
+        dim, out_shape.data(), out_strides1.data(), out_strides2.data(),
+        out_strides3.data(), disp1, disp2, disp3);
+    out_shape.resize(nd);
+    out_strides1.resize(nd);
+    out_strides2.resize(nd);
+    out_strides3.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2,
+                           out_strides3, disp3);
+}
+
+/*
+    For purposes of iterating over pairs of elements of four arrays
+    with `shape` and strides `strides1`, `strides2`, `strides3`,
+    `strides4` given as pointers `simplify_iteration_four_strides(nd,
+    shape_ptr, strides1_ptr, strides2_ptr, strides3_ptr, strides4_ptr,
+    disp1, disp2, disp3, disp4)` may modify memory and returns new
+    length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3,
+    new_stride4, disp4)` are such that iterating over them will traverse the
+    same set of tuples of elements, possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_four_strides(const int nd,
+                                    ShapeTy *shape,
+                                    StridesTy *strides1,
+                                    StridesTy *strides2,
+                                    StridesTy *strides3,
+                                    StridesTy *strides4,
+                                    StridesTy &disp1,
+                                    StridesTy &disp2,
+                                    StridesTy &disp3,
+                                    StridesTy &disp4)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    disp3 = StridesTy(0);
+    disp4 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(),
+        [&strides1, &strides2, &strides3, &strides4, &shape](int i1, int i2) {
+            auto abs_str1_i1 =
+                (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+            auto abs_str1_i2 =
+                (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+            auto abs_str2_i1 =
+                (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+            auto abs_str2_i2 =
+                (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+            auto abs_str3_i1 =
+                (strides3[i1] < 0) ? -strides3[i1] : strides3[i1];
+            auto abs_str3_i2 =
+                (strides3[i2] < 0) ? -strides3[i2] : strides3[i2];
+            auto abs_str4_i1 =
+                (strides4[i1] < 0) ? -strides4[i1] : strides4[i1];
+            auto abs_str4_i2 =
+                (strides4[i2] < 0) ? -strides4[i2] : strides4[i2];
+            return (abs_str4_i1 > abs_str4_i2) ||
+                   ((abs_str4_i1 == abs_str4_i2) &&
+                    ((abs_str3_i1 > abs_str3_i2) ||
+                     ((abs_str3_i1 == abs_str3_i2) &&
+                      ((abs_str2_i1 > abs_str2_i2) ||
+                       ((abs_str2_i1 == abs_str2_i2) &&
+                        ((abs_str1_i1 > abs_str1_i2) ||
+                         ((abs_str1_i1 == abs_str1_i2) &&
+                          (shape[i1] > shape[i2]))))))));
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+    std::vector<StridesTy> strides3_w;
+    std::vector<StridesTy> strides4_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        auto str3_p = strides3[p];
+        auto str4_p = strides4[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 && str4_p <= 0 &&
+            std::min({str1_p, str2_p, str3_p, str4_p}) < 0) {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+            disp3 += str3_p * (sh_p - 1);
+            str3_p = -str3_p;
+            disp4 += str4_p * (sh_p - 1);
+            str4_p = -str4_p;
+        }
+        if (str1_p < 0 || str2_p < 0 || str3_p < 0 || str4_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+        strides3_w.push_back(str3_p);
+        strides4_w.push_back(str4_p);
+    }
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy str3 = strides3_w[i + 1];
+            StridesTy str4 = strides4_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+            StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3;
+            StridesTy jump4 = strides4_w[i] - (shape_w[i + 1] - 1) * str4;
+
+            if (jump1 == str1 && jump2 == str2 && jump3 == str3 &&
+                jump4 == str4) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides3_w[j] = strides3_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides4_w[j] = strides4_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides3[i] = strides3_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides4[i] = strides4_w[i];
+    }
+
+    return nd_;
+}
+    out_strides2.resize(nd);
+    out_strides3.resize(nd);
+    out_strides4.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2,
+                           out_strides3, disp3, out_strides4, disp4);
+}
+
+/*
+   For purposes of iterating over elements of an array with `shape` and
+   strides `strides` given as pointers, `compact_iteration(nd, shape, strides)`
+   may modify memory and returns the new length of the array.
+
+   The new shape and new strides `(new_shape, new_strides)` are such that
+   iterating over them will traverse the same elements in the same order,
+   possibly with reduced dimensionality.
+ */
+template <typename ShapeTy, typename StridesTy>
+int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
+{
+    if (nd < 2)
+        return nd;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        if (strides[i] < 0) {
+            contractable = false;
+        }
+    }
+
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str = strides[i + 1];
+            StridesTy jump = strides[i] - (shape[i + 1] - 1) * str;
+
+            if (jump == str) {
+                changed = true;
+                shape[i] *= shape[i + 1];
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides[j] = strides[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape[j] = shape[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+
+    return nd_;
+}
+} // namespace dpctl::tensor::strides
diff --git a/dpnp/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
new file mode 100644
index 000000000000..76f0174b9fdf
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
@@ -0,0 +1,223 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utilities for USM memory allocation and deallocation,
+/// including smart-pointer wrappers over USM allocations.
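+///
+/// Example (an illustrative sketch, assuming a valid sycl::queue `q` and an
+/// element count `n`): scratch device memory is typically obtained via
+/// `smart_malloc_device` and its release scheduled with `async_smart_free`:
+///
+///     using namespace dpctl::tensor::alloc_utils;
+///     auto tmp = smart_malloc_device<int>(n, q); // unique_ptr + USMDeleter
+///     sycl::event e = q.fill<int>(tmp.get(), 0, n);
+///     // ownership moves to a host_task that frees `tmp` once `e` completes
+///     sycl::event cleanup_ev = async_smart_free(q, {e}, tmp);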
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::alloc_utils +{ +template +class usm_host_allocator : public sycl::usm_allocator +{ +public: + using baseT = sycl::usm_allocator; + using baseT::baseT; + + template + struct rebind + { + typedef usm_host_allocator other; + }; + + void deallocate(T *ptr, std::size_t n) + { + try { + baseT::deallocate(ptr, n); + } catch (const std::exception &e) { + std::cerr + << "Exception caught in `usm_host_allocator::deallocate`: " + << e.what() << std::endl; + } + } +}; + +template +void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept +{ + try { + sycl::free(ptr, ctx); + } catch (const std::exception &e) { + std::cerr << "Call to sycl::free caught exception: " << e.what() + << std::endl; + } +} + +template +void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept +{ + sycl_free_noexcept(ptr, q.get_context()); +} + +class USMDeleter +{ +private: + sycl::context ctx_; + +public: + USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {} + USMDeleter(const sycl::context &ctx) : ctx_(ctx) {} + + template + void operator()(T *ptr) const + { + sycl_free_noexcept(ptr, ctx_); + } +}; + +template +std::unique_ptr + smart_malloc(std::size_t count, + const sycl::queue &q, + sycl::usm::alloc kind, + const sycl::property_list &propList = {}) +{ + T *ptr = sycl::malloc(count, q, kind, propList); + if (nullptr == ptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + auto usm_deleter = USMDeleter(q); + return std::unique_ptr(ptr, usm_deleter); +} + +template +std::unique_ptr + smart_malloc_device(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::device, propList); +} + +template +std::unique_ptr + smart_malloc_shared(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::shared, propList); +} + +template +std::unique_ptr + smart_malloc_host(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::host, propList); +} + +namespace detail +{ +template +struct valid_smart_ptr : public std::false_type +{ +}; + +template +struct valid_smart_ptr &> + : public std::is_same +{ +}; + +template +struct valid_smart_ptr> + : public std::is_same +{ +}; + +// base case +template +struct all_valid_smart_ptrs +{ + static constexpr bool value = true; +}; + +template +struct all_valid_smart_ptrs +{ + static constexpr bool value = valid_smart_ptr::value && + (all_valid_smart_ptrs::value); +}; +} // end of namespace detail + +/*! 
@brief Submit host_task and transfer ownership from smart pointers to it */ +template +sycl::event async_smart_free(sycl::queue &exec_q, + const std::vector &depends, + UniquePtrTs &&...unique_pointers) +{ + static constexpr std::size_t n = sizeof...(UniquePtrTs); + static_assert( + n > 0, "async_smart_free requires at least one smart pointer argument"); + + static_assert( + detail::all_valid_smart_ptrs::value, + "async_smart_free requires unique_ptr created with smart_malloc"); + + std::vector ptrs; + ptrs.reserve(n); + (ptrs.push_back(reinterpret_cast(unique_pointers.get())), ...); + + std::vector dels; + dels.reserve(n); + (dels.emplace_back(unique_pointers.get_deleter()), ...); + + sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.host_task([ptrs = std::move(ptrs), dels = std::move(dels)]() { + for (std::size_t i = 0; i < ptrs.size(); ++i) { + dels[i](ptrs[i]); + } + }); + }); + + // Upon successful submission of host_task, USM allocations are owned + // by the host_task. Release smart pointer ownership to avoid double + // deallocation + (unique_pointers.release(), ...); + + return ht_e; +} +} // namespace dpctl::tensor::alloc_utils diff --git a/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp new file mode 100644 index 000000000000..9ae41e5ade6e --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp @@ -0,0 +1,674 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines utilities used for kernel submission. 
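+///
+/// Example (an illustrative sketch, assuming a valid sycl::queue `q` and a
+/// 1-D problem size `nelems`):
+///
+///     const std::vector<std::size_t> sg_sizes =
+///         q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+///     // smallest multiple of a supported sub-group size (up to 4x here)
+///     // such that a single work-group can cover `nelems`, if possible
+///     const std::size_t wg =
+///         dpctl::tensor::sycl_utils::choose_workgroup_size<4>(nelems,
+///                                                             sg_sizes);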
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "math_utils.hpp" + +namespace dpctl::tensor::sycl_utils +{ +namespace detail +{ +template +struct TypeList; + +template +struct TypeList +{ + using head = Head; + using tail = TypeList; +}; + +using NullTypeList = TypeList<>; +template +struct IsNullTypeList : std::conditional_t, + std::true_type, + std::false_type> +{ +}; + +// recursively check if type is contained in given TypeList +template +struct IsContained + : std::conditional_t< + std::is_same_v>, + std::true_type, + IsContained> +{ +}; + +template <> +struct TypeList<> +{ +}; + +// std::false_type when last case has been checked for membership +template +struct IsContained : std::false_type +{ +}; + +template +struct IsComplex : std::false_type +{ +}; +template +struct IsComplex> : std::true_type +{ +}; +} // namespace detail + +template +using sycl_ops = detail::TypeList, + sycl::bit_or, + sycl::bit_xor, + sycl::bit_and, + sycl::maximum, + sycl::minimum, + sycl::multiplies>; + +template +struct IsSyclOp +{ + static constexpr bool value = + detail::IsContained>>::value || + detail::IsContained>>::value; +}; + +/*! @brief Find the smallest multiple of supported sub-group size larger than + * nelems */ +template +std::size_t choose_workgroup_size(const std::size_t nelems, + const std::vector &sg_sizes) +{ + std::vector wg_choices; + wg_choices.reserve(f * sg_sizes.size()); + + for (const auto &sg_size : sg_sizes) { +#pragma unroll + for (std::size_t i = 1; i <= f; ++i) { + wg_choices.push_back(sg_size * i); + } + } + std::sort(std::begin(wg_choices), std::end(wg_choices)); + + std::size_t wg = 1; + for (std::size_t i = 0; i < wg_choices.size(); ++i) { + if (wg_choices[i] == wg) { + continue; + } + wg = wg_choices[i]; + std::size_t n_groups = ((nelems + wg - 1) / wg); + if (n_groups == 1) + break; + } + + return wg; +} + +namespace detail +{ + +template +void _fold(LocAccT &local_mem_acc, + const std::uint32_t lid, + const std::uint32_t cutoff, + const std::uint32_t step, + const OpT &op) +{ + if (lid < cutoff) { + local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]); + } +} + +template +void _fold(LocAccT &local_mem_acc, + const std::uint32_t lid, + const std::uint32_t step, + const OpT &op) +{ + if (lid < step) { + local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]); + } +} + +} // end of namespace detail + +template +T custom_reduce_over_group(const GroupT &wg, + LocAccT local_mem_acc, + const T &local_val, + const OpT &op) +{ + // value experimentally tuned to achieve best runtime on Iris Xe, + // Arc A140V integrated Intel GPUs, and discrete Intel Max GPU. 
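+    // Illustrative folding schedule (assuming wgs = 96, not a power of
+    // two): the active work-item count shrinks as 96 -> 48 -> 24 -> 12
+    // -> 6, at each step folding the upper half into the lower half;
+    // the remaining partial values (fewer than low_sz) are combined
+    // sequentially by the group leader and the result is broadcast to
+    // the whole work-group.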
+    static constexpr std::uint32_t low_sz = 8u;
+    // maximal work-group size
+    static constexpr std::uint32_t high_sz = 1024u;
+    const std::uint32_t wgs = wg.get_local_linear_range();
+    const std::uint32_t lid = wg.get_local_linear_id();
+
+    local_mem_acc[lid] = local_val;
+    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+
+    std::uint32_t n_witems = wgs;
+    if (wgs & (wgs - 1)) {
+        // wgs is not a power of 2
+#pragma unroll
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
+            if (n_witems >= sz) {
+                const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
+                detail::_fold(local_mem_acc, lid, n_witems - n_witems_,
+                              n_witems_, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+                n_witems = n_witems_;
+            }
+        }
+    }
+    else {
+        // wgs is a power of 2
+#pragma unroll
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
+            if (n_witems >= sz) {
+                n_witems >>= 1;
+                detail::_fold(local_mem_acc, lid, n_witems, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+            }
+        }
+    }
+
+    T red_val_over_wg = local_mem_acc[0];
+    if (wg.leader()) {
+        for (std::uint32_t i = 1; i < n_witems; ++i) {
+            red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]);
+        }
+    }
+
+    return sycl::group_broadcast(wg, red_val_over_wg, 0);
+}
+
+template <typename GroupT,
+          typename SubGroupT,
+          typename LocAccT,
+          typename T,
+          typename OpT>
+T custom_inclusive_scan_over_group(GroupT &&wg,
+                                   SubGroupT &&sg,
+                                   LocAccT &&local_mem_acc,
+                                   const T &local_val,
+                                   const T &identity,
+                                   OpT &&op)
+{
+    const std::uint32_t local_id = wg.get_local_id(0);
+    const std::uint32_t wgs = wg.get_local_range(0);
+
+    const std::uint32_t lane_id = sg.get_local_id()[0];
+    const std::uint32_t sgSize = sg.get_local_range()[0];
+
+    T scan_val = local_val;
+    for (std::uint32_t step = 1; step < sgSize; step *= 2) {
+        const bool advanced_lane = (lane_id >= step);
+        const std::uint32_t src_lane_id =
+            (advanced_lane ? lane_id - step : lane_id);
+        const T modifier = sycl::select_from_group(sg, scan_val, src_lane_id);
+        if (advanced_lane) {
+            scan_val = op(scan_val, modifier);
+        }
+    }
+
+    local_mem_acc[local_id] = scan_val;
+    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+
+    const std::uint32_t max_sgSize = sg.get_max_local_range()[0];
+    const std::uint32_t sgr_id = sg.get_group_id()[0];
+
+    // now scan the per-sub-group aggregates
+    const std::uint32_t n_aggregates = 1 + ((wgs - 1) / max_sgSize);
+    const bool large_wg = (n_aggregates > max_sgSize);
+    if (large_wg) {
+        if (wg.leader()) {
+            T _scan_val = identity;
+            for (std::uint32_t i = 1; i <= n_aggregates - max_sgSize; ++i) {
+                _scan_val = op(local_mem_acc[i * max_sgSize - 1], _scan_val);
+                local_mem_acc[i * max_sgSize - 1] = _scan_val;
+            }
+        }
+        sycl::group_barrier(wg, sycl::memory_scope::work_group);
+    }
+
+    if (sgr_id == 0) {
+        const std::uint32_t offset =
+            (large_wg) ? n_aggregates - max_sgSize : 0u;
+        const bool in_range = (lane_id < n_aggregates);
+        const bool in_bounds = in_range && (lane_id > 0 || large_wg);
+
+        // There is a bug where IGC incorrectly optimizes the code below:
+        //     T __scan_val = (in_bounds)
+        //         ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
+        //         : identity;
+        // so that `__scan_val` does not get initialized with the `identity`
+        // value where it must be, e.g. for
+        //     wgs = 256, max_sgSize = 16 => n_aggregates = 16
+        //     wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
+        // The workaround below adds a SYCL atomic fence: the explicit memory
+        // fence prevents the reordering/elimination, at the cost of a slight
+        // overhead.
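+        // Illustrative example (assuming wgs = 64 and max_sgSize = 16):
+        // there are n_aggregates = 4 sub-group totals, stored at
+        // local-memory indices 15, 31 and 47; lanes 1..3 of sub-group 0
+        // read them, while lane 0 keeps `identity` since no aggregate
+        // precedes it.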
+ T __scan_val = identity; + sycl::atomic_fence(sycl::memory_order::relaxed, + sycl::memory_scope::work_item); + if (in_bounds) { + __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1]; + } + for (std::uint32_t step = 1; step < sgSize; step *= 2) { + const bool advanced_lane = (lane_id >= step); + const std::uint32_t src_lane_id = + (advanced_lane ? lane_id - step : lane_id); + const T modifier = + sycl::select_from_group(sg, __scan_val, src_lane_id); + if (advanced_lane && in_range) { + __scan_val = op(__scan_val, modifier); + } + } + if (in_bounds) { + local_mem_acc[(offset + lane_id) * max_sgSize - 1] = __scan_val; + } + } + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + if (sgr_id > 0) { + const T modifier = local_mem_acc[sgr_id * max_sgSize - 1]; + scan_val = op(scan_val, modifier); + } + + // ensure all work-items finished reading from SLM + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + return scan_val; +} + +// Reduction functors + +// Maximum + +template +struct Maximum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::max_complex; + return max_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x > y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x || y; + } + else { + return (x > y) ? x : y; + } + } +}; + +// Minimum + +template +struct Minimum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::min_complex; + return min_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x < y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x && y; + } + else { + return (x < y) ? x : y; + } + } +}; + +// Define identities and operator checking structs + +template +struct GetIdentity +{ +}; + +// Maximum + +template +using IsMaximum = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMaximum = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(-std::numeric_limits::infinity()) + : std::numeric_limits::lowest()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = false; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; +}; + +// Minimum + +template +using IsMinimum = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMinimum = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? 
static_cast(std::numeric_limits::infinity()) + : std::numeric_limits::max()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = true; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; +}; + +// Plus + +template +using IsPlus = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclPlus = std::bool_constant>>; + +// Multiplies + +template +using IsMultiplies = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMultiplies = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// LogSumExp + +template +struct LogSumExp +{ + T operator()(const T &x, const T &y) const + { + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(x, y); + } +}; + +template +using IsLogSumExp = std::bool_constant>>; + +// only defined for types with infinity +template +struct GetIdentity::value>> +{ + static constexpr T value = -std::numeric_limits::infinity(); +}; + +// Hypot + +template +struct Hypot +{ + T operator()(const T &x, const T &y) const { return sycl::hypot(x, y); } +}; + +template +using IsHypot = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = 0; +}; + +// Logical_And + +template +using IsLogicalAnd = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclLogicalAnd = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// Logical_Or + +template +using IsLogicalOr = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclLogicalOr = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(0); +}; + +// Identity + +template +struct Identity +{ +}; + +template +using UseBuiltInIdentity = + std::conjunction, sycl::has_known_identity>; + +template +struct Identity::value>> +{ + static constexpr T value = GetIdentity::value; +}; + +template +struct Identity::value>> +{ + static constexpr T value = sycl::known_identity::value; +}; + +// Sub-group load/store + +#ifndef USE_GROUP_LOAD_STORE +#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE) && \ + SYCL_EXT_ONEAPI_GROUP_LOAD_STORE +#define USE_GROUP_LOAD_STORE 1 +#else +#if defined(__LIBSYCL_MAJOR_VERSION) && (__LIBSYCL_MAJOR_VERSION >= 8u) +#define USE_GROUP_LOAD_STORE 1 +#else +#define USE_GROUP_LOAD_STORE 0 +#endif +#endif +#endif + +#if (USE_GROUP_LOAD_STORE) +namespace ls_ns = sycl::ext::oneapi::experimental; +#endif + +template +auto sub_group_load(const sycl::sub_group &sg, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + using ValueT = typename std::remove_cv_t; + sycl::vec x{}; + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_load(sg, m_ptr, x, striped); + return x; +#else + return sg.load(m_ptr); +#endif +} + +template +auto sub_group_load(const sycl::sub_group &sg, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + using ValueT = typename std::remove_cv_t; + ValueT x{}; + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_load(sg, m_ptr, x, striped); + return x; +#else + return sg.load(m_ptr); +#endif +} + +template +std::enable_if_t< + std::is_same_v, std::remove_cv_t>, + void> + sub_group_store(const sycl::sub_group &sg, + const sycl::vec &val, + sycl::multi_ptr m_ptr) +{ +#if 
(USE_GROUP_LOAD_STORE) + static_assert(std::is_same_v); + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_store(sg, val, m_ptr, striped); + return; +#else + sg.store(m_ptr, val); + return; +#endif +} + +template +std::enable_if_t< + std::is_same_v, std::remove_cv_t>, + void> + sub_group_store(const sycl::sub_group &sg, + const VecT &val, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_store(sg, val, m_ptr, striped); + return; +#else + sg.store(m_ptr, val); + return; +#endif +} +} // namespace dpctl::tensor::sycl_utils diff --git a/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp new file mode 100644 index 000000000000..bead0da5093e --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp @@ -0,0 +1,135 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +/// +/// \file +/// This file defines class to implement dispatch tables for pair of types +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +#include "type_dispatch_building.hpp" + +namespace dpctl::tensor::type_dispatch +{ +struct usm_ndarray_types +{ + int typenum_to_lookup_id(int typenum) const + { + using typenum_t = ::dpctl::tensor::type_dispatch::typenum_t; + auto const &api = ::dpctl::detail::dpctl_capi::get(); + + if (typenum == api.UAR_DOUBLE_) { + return static_cast(typenum_t::DOUBLE); + } + else if (typenum == api.UAR_INT64_) { + return static_cast(typenum_t::INT64); + } + else if (typenum == api.UAR_INT32_) { + return static_cast(typenum_t::INT32); + } + else if (typenum == api.UAR_BOOL_) { + return static_cast(typenum_t::BOOL); + } + else if (typenum == api.UAR_CDOUBLE_) { + return static_cast(typenum_t::CDOUBLE); + } + else if (typenum == api.UAR_FLOAT_) { + return static_cast(typenum_t::FLOAT); + } + else if (typenum == api.UAR_INT16_) { + return static_cast(typenum_t::INT16); + } + else if (typenum == api.UAR_INT8_) { + return static_cast(typenum_t::INT8); + } + else if (typenum == api.UAR_UINT64_) { + return static_cast(typenum_t::UINT64); + } + else if (typenum == api.UAR_UINT32_) { + return static_cast(typenum_t::UINT32); + } + else if (typenum == api.UAR_UINT16_) { + return static_cast(typenum_t::UINT16); + } + else if (typenum == api.UAR_UINT8_) { + return static_cast(typenum_t::UINT8); + } + else if (typenum == api.UAR_CFLOAT_) { + return static_cast(typenum_t::CFLOAT); + } + else if (typenum == api.UAR_HALF_) { + return static_cast(typenum_t::HALF); + } + else if (typenum == api.UAR_INT_ || typenum == api.UAR_UINT_) { + switch (sizeof(int)) { + case sizeof(std::int32_t): + return ((typenum == api.UAR_INT_) + ? static_cast(typenum_t::INT32) + : static_cast(typenum_t::UINT32)); + case sizeof(std::int64_t): + return ((typenum == api.UAR_INT_) + ? static_cast(typenum_t::INT64) + : static_cast(typenum_t::UINT64)); + default: + throw_unrecognized_typenum_error(typenum); + } + } + else if (typenum == api.UAR_LONGLONG_ || + typenum == api.UAR_ULONGLONG_) { + switch (sizeof(long long)) { + case sizeof(std::int64_t): + return ((typenum == api.UAR_LONGLONG_) + ? static_cast(typenum_t::INT64) + : static_cast(typenum_t::UINT64)); + default: + throw_unrecognized_typenum_error(typenum); + } + } + else { + throw_unrecognized_typenum_error(typenum); + } + // return code signalling error, should never be reached + assert(false); + return -1; + } + +private: + void throw_unrecognized_typenum_error(int typenum) const + { + throw std::runtime_error("Unrecognized typenum " + + std::to_string(typenum) + " encountered."); + } +}; +} // namespace dpctl::tensor::type_dispatch diff --git a/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp new file mode 100644 index 000000000000..7170624b5bbe --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp @@ -0,0 +1,293 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines class to implement dispatch tables for pair of types +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::type_dispatch +{ +enum class typenum_t : int +{ + BOOL = 0, + INT8, // 1 + UINT8, + INT16, + UINT16, + INT32, // 5 + UINT32, + INT64, + UINT64, + HALF, + FLOAT, // 10 + DOUBLE, + CFLOAT, + CDOUBLE, // 13 +}; +inline constexpr int num_types = 14; // number of elements in typenum_t + +template typename factory, + int _num_types> +class DispatchTableBuilder +{ +private: + template + const std::vector row_per_dst_type() const + { + std::vector per_dstTy = { + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory>{}.get(), + factory>{}.get()}; + assert(per_dstTy.size() == _num_types); + return per_dstTy; + } + +public: + DispatchTableBuilder() = default; + ~DispatchTableBuilder() = default; + + void populate_dispatch_table(funcPtrT table[][_num_types]) const + { + const auto map_by_dst_type = {row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type>(), + row_per_dst_type>()}; + assert(map_by_dst_type.size() == _num_types); + int dst_id = 0; + for (const auto &row : map_by_dst_type) { + int src_id = 0; + for (const auto &fn_ptr : row) { + table[dst_id][src_id] = fn_ptr; + ++src_id; + } + ++dst_id; + } + } +}; + +template typename factory, + int _num_types> +class DispatchVectorBuilder +{ +private: + template + const funcPtrT func_per_type() const + { + 
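+        // each `factory` specialization is expected to return either a
+        // kernel function pointer for type `Ty`, or `nullptr` when the
+        // type is not supported (callers check the entry for nullptr)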
funcPtrT f = factory{}.get(); + return f; + } + +public: + DispatchVectorBuilder() = default; + ~DispatchVectorBuilder() = default; + + void populate_dispatch_vector(funcPtrT vector[]) const + { + const auto fn_map_by_type = {func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type>(), + func_per_type>()}; + assert(fn_map_by_type.size() == _num_types); + int ty_id = 0; + for (const auto &fn : fn_map_by_type) { + vector[ty_id] = fn; + ++ty_id; + } + } +}; + +/*! @brief struct to define result_type typename for Ty == ArgTy */ +template +struct TypeMapResultEntry : std::is_same +{ + using result_type = ResTy; +}; + +/*! @brief struct to define result_type typename for Ty1 == ArgTy1 && Ty2 == + * ArgTy2 */ +template +struct BinaryTypeMapResultEntry + : std::conjunction, std::is_same> +{ + using result_type = ResTy; +}; + +/*! @brief fall-through struct with specified result_type, usually void */ +template +struct DefaultResultEntry : std::true_type +{ + using result_type = Ty; +}; + +/*! @brief Utility struct to convert C++ type into typeid integer */ +template +struct GetTypeid +{ + int get() + { + if constexpr (std::is_same_v) { + return static_cast(typenum_t::BOOL); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::HALF); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::FLOAT); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::DOUBLE); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CFLOAT); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CDOUBLE); + } + else if constexpr (std::is_same_v) { // special token + return -1; + } + + assert(("Unsupported type T", false)); + return -2; + } +}; + +/*! @brief Class to generate vector of null function pointers */ +template +struct NullPtrVector +{ + + using value_type = FunPtrT; + using const_reference = value_type const &; + + NullPtrVector() : val(nullptr) {} + + const_reference operator[](int) const { return val; } + +private: + value_type val; +}; + +/*! 
@brief Class to generate table of null function pointers */ +template +struct NullPtrTable +{ + using value_type = NullPtrVector; + using const_reference = value_type const &; + + NullPtrTable() : val() {} + + const_reference operator[](int) const { return val; } + +private: + value_type val; +}; + +template +struct TypePairDefinedEntry + : std::conjunction, std::is_same> +{ + static constexpr bool is_defined = true; +}; + +struct NotDefinedEntry : std::true_type +{ + static constexpr bool is_defined = false; +}; +} // namespace dpctl::tensor::type_dispatch diff --git a/dpnp/tensor/libtensor/include/utils/type_utils.hpp b/dpnp/tensor/libtensor/include/utils/type_utils.hpp new file mode 100644 index 000000000000..47b1a5554815 --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/type_utils.hpp @@ -0,0 +1,163 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines functions for value casting. 
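+///
+/// Example (an illustrative sketch): `convert_impl` follows NumPy
+/// conversion semantics rather than a plain static_cast, e.g.
+///
+///     namespace tu = dpctl::tensor::type_utils;
+///     // complex -> bool: true when real or imaginary part is non-zero
+///     bool b = tu::convert_impl<bool, std::complex<float>>({0.0f, 2.0f});
+///     // complex -> real: keeps the real part
+///     float r = tu::convert_impl<float, std::complex<float>>({1.5f, -2.0f});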
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <complex>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::type_utils
+{
+template <typename T, typename Enable = void>
+struct is_complex : public std::false_type
+{
+};
+
+template <typename T>
+struct is_complex<
+    T,
+    std::enable_if_t<std::is_same_v<std::remove_cv_t<T>, std::complex<float>> ||
+                     std::is_same_v<std::remove_cv_t<T>, std::complex<double>>>>
+    : public std::true_type
+{
+};
+
+template <typename T>
+inline constexpr bool is_complex_v = is_complex<T>::value;
+
+template <typename dstTy, typename srcTy>
+dstTy convert_impl(const srcTy &v)
+{
+    if constexpr (std::is_same_v<dstTy, srcTy>) {
+        return v;
+    }
+    else if constexpr (std::is_same_v<dstTy, bool>) {
+        if constexpr (is_complex_v<srcTy>) {
+            // bool(complex_v) ==
+            //     (complex_v.real() != 0) || (complex_v.imag() != 0)
+            return (convert_impl<bool, typename srcTy::value_type>(v.real()) ||
+                    convert_impl<bool, typename srcTy::value_type>(v.imag()));
+        }
+        else {
+            return static_cast<dstTy>(v != srcTy{0});
+        }
+    }
+    else if constexpr (std::is_same_v<srcTy, bool>) {
+        // C++ interprets a byte of storage behind bool by only
+        // testing its least significant bit, leading to both
+        // 0x00 and 0x02 being interpreted as False, while 0x01 and 0xFF
+        // are interpreted as True. NumPy's interpretation of the underlying
+        // storage is different: any bit set is interpreted as True,
+        // no bits set as False, see gh-2121
+        const std::uint8_t &u = sycl::bit_cast<std::uint8_t>(v);
+        if constexpr (is_complex_v<dstTy>) {
+            return (u == 0) ? dstTy{} : dstTy{1, 0};
+        }
+        else {
+            return (u == 0) ? dstTy{} : dstTy{1};
+        }
+    }
+    else if constexpr (is_complex_v<srcTy> && !is_complex_v<dstTy>) {
+        // real_t(complex_v) == real_t(complex_v.real())
+        return convert_impl<dstTy, typename srcTy::value_type>(v.real());
+    }
+    else if constexpr (!std::is_integral_v<srcTy> &&
+                       !std::is_same_v<dstTy, bool> &&
+                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>)
+    {
+        // first cast to the signed variant, then cast to the unsigned one
+        using signedT = typename std::make_signed_t<dstTy>;
+        return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
+    }
+    else {
+        return static_cast<dstTy>(v);
+    }
+}
+
+template <typename Ty>
+void validate_type_for_device(const sycl::device &d)
+{
+    if constexpr (std::is_same_v<Ty, double>) {
+        if (!d.has(sycl::aspect::fp64)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'float64'");
+        }
+    }
+    else if constexpr (std::is_same_v<Ty, std::complex<double>>) {
+        if (!d.has(sycl::aspect::fp64)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'complex128'");
+        }
+    }
+    else if constexpr (std::is_same_v<Ty, sycl::half>) {
+        if (!d.has(sycl::aspect::fp16)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'float16'");
+        }
+    }
+}
+
+template <typename Ty>
+void validate_type_for_device(const sycl::queue &q)
+{
+    validate_type_for_device<Ty>(q.get_device());
+}
+
+template <typename Op, typename Vec, std::size_t... I>
+auto vec_cast_impl(const Vec &v, std::index_sequence<I...>)
+{
+    return Op{v[I]...};
+}
+
+template <typename dstT,
+          typename srcT,
+          std::size_t N,
+          typename Indices = std::make_index_sequence<N>>
+auto vec_cast(const sycl::vec<srcT, N> &s)
+{
+    if constexpr (std::is_same_v<srcT, dstT>) {
+        return s;
+    }
+    else {
+        return vec_cast_impl<sycl::vec<dstT, N>, sycl::vec<srcT, N>>(s,
+                                                                     Indices{});
+    }
+}
+} // namespace dpctl::tensor::type_utils
diff --git a/dpnp/tensor/libtensor/source/accumulators.cpp b/dpnp/tensor/libtensor/source/accumulators.cpp
new file mode 100644
index 000000000000..c6ab96418d47
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators.cpp
@@ -0,0 +1,407 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +// Computation of positions of masked elements + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t + mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; +static cumsum_val_contig_impl_fn_ptr_t + mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t + mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; +static cumsum_val_strided_impl_fn_ptr_t + mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt64; + 
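+    // the strided variants below mirror the contiguous ones above; both
+    // int32 and int64 cumsum dtypes are supported, and py_mask_positions
+    // selects the vector matching the dtype of the `cumsum` array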
td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector); + + return; +} + +std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); + + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int32_t/int64_t only + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int32 or int64 data-type."); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + std::vector host_task_events; + + if (mask.is_c_contiguous()) { + auto fn = (use_i32) + ? mask_positions_contig_i32_dispatch_vector[mask_typeid] + : mask_positions_contig_i64_dispatch_vector[mask_typeid]; + + std::size_t total_set; + + { + py::gil_scoped_release release; + + total_set = fn(exec_q, mask_size, mask_data, cumsum_data, + host_task_events, depends); + + sycl::event::wait(host_task_events); + } + return total_set; + } + + const py::ssize_t *shape = mask.get_shape_raw(); + auto const &strides_vector = mask.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + compact_iteration_space(nd, shape, strides_vector, compact_shape, + compact_strides); + + // Strided implementation + auto strided_fn = + (use_i32) ? 
mask_positions_strided_i32_dispatch_vector[mask_typeid] + : mask_positions_strided_i64_dispatch_vector[mask_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + { + py::gil_scoped_release release; + + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(nullptr); + } + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + std::size_t total_set; + + { + py::gil_scoped_release release; + + total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, + cumsum_data, host_task_events, dependent_events); + + sycl::event::wait(host_task_events); + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(nullptr); + } + + return total_set; +} + +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t + cumsum_1d_strided_dispatch_vector[td_ns::num_types]; +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_vector[td_ns::num_types]; + +void populate_cumsum_1d_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector); + + using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector); + + return; +} + +std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + // cumsum.shape == (src.size,) + auto src_size = src.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != src_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); + + if (src_size == 0) { + return 0; + } + + int src_typenum = src.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // src can be any type + const char *src_data = src.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // this cumsum must be 
int64_t only + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + std::vector host_task_events; + + if (src.is_c_contiguous()) { + auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + std::size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total; + } + + const py::ssize_t *shape = src.get_shape_raw(); + auto const &strides_vector = src.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int src_nd = src.get_ndim(); + int nd = src_nd; + + compact_iteration_space(nd, shape, strides_vector, compact_shape, + compact_strides); + + // Strided implementation + auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + { + py::gil_scoped_release release; + + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(nullptr); + } + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + std::size_t total = + strided_fn(exec_q, src_size, src_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); + + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(nullptr); + } + + return total; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators.hpp b/dpnp/tensor/libtensor/source/accumulators.hpp new file mode 100644 index 000000000000..e400aad2dceb --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators.hpp @@ -0,0 +1,61 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern std::size_t + py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void populate_cumsum_1d_dispatch_vectors(void); + +extern std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends = {}); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp new file mode 100644 index 000000000000..bce47c45f9b1 --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -0,0 +1,461 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
new file mode 100644
index 000000000000..bce47c45f9b1
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
@@ -0,0 +1,461 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/accumulators.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event>
+    py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src,
+                            const int trailing_dims_to_accumulate,
+                            const dpctl::tensor::usm_ndarray &dst,
+                            sycl::queue &exec_q,
+                            std::vector<sycl::event> const &depends,
+                            const strided_fnT &strided_dispatch_table,
+                            const contig_fnT &contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iter_nd = src_nd - trailing_dims_to_accumulate;
+    if (trailing_dims_to_accumulate <= 0 || iter_nd < 0) {
+        throw py::value_error(
+            "trailing_dims_to_accumulate must be positive, but no "
+            "greater than rank of the input array");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+    for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t acc_nelems(1);
+    for (int i = iter_nd; same_shapes && (i < src_nd); ++i) {
+        auto dst_shape_i = dst_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i);
+        acc_nelems *= static_cast<std::size_t>(dst_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (acc_nelems == 0)) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, acc_nelems * iter_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    std::vector<sycl::event> host_task_events;
+
+    if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) {
+        auto fn = contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error("Datatypes are not supported");
+        }
+
+        sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data,
+                                host_task_events, depends);
+
+        return std::make_pair(
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}),
+            acc_ev);
+    }
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    int acc_nd = trailing_dims_to_accumulate;
+
+    using shT = std::vector<py::ssize_t>;
+    shT acc_shape(std::begin(src_shape_vec) + iter_nd,
+                  std::end(src_shape_vec));
+
+    shT acc_src_strides(std::begin(src_strides_vec) + iter_nd,
+                        std::end(src_strides_vec));
+
+    shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd,
+                        std::end(dst_strides_vec));
+
+    shT iter_shape(std::begin(src_shape_vec),
+                   std::begin(src_shape_vec) + iter_nd);
+
+    shT iter_src_strides(std::begin(src_strides_vec),
+                         std::begin(src_strides_vec) + iter_nd);
+
+    shT iter_dst_strides(std::begin(dst_strides_vec),
+                         std::begin(dst_strides_vec) + iter_nd);
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_iter_shape,
+        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
+        acc_src_strides, acc_dst_strides);
+    auto packed_shapes_and_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_and_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *acc_shapes_and_strides =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+
+    sycl::event acc_ev = strided_fn(
+        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
+        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
+        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
+
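+    // Lifetime note: packed_shapes_and_strides is USM memory that the kernel
+    // may still be reading at this point, so it must not be freed
+    // synchronously here. The async_smart_free helper used below schedules
+    // the deallocation as a host task gated on acc_ev; conceptually
+    // (illustrative sketch only, not the actual implementation from
+    // utils/sycl_alloc_utils.hpp):
+    //
+    //   sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+    //       cgh.depends_on(acc_ev);
+    //       cgh.host_task([owner = std::move(usm_owner)]() mutable {
+    //           owner.reset(); // USM deleter runs after the kernel completes
+    //       });
+    //   });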
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {acc_ev}, packed_shapes_and_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
+        acc_ev);
+}
+
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_accumulate_final_axis_include_initial(
+    const dpctl::tensor::usm_ndarray &src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    std::vector<sycl::event> const &depends,
+    const strided_fnT &strided_dispatch_table,
+    const contig_fnT &contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+
+    static constexpr int acc_nd = 1;
+
+    int iter_nd = src_nd - acc_nd;
+    if (iter_nd < 0) {
+        throw py::value_error("accumulation axis must not be greater than rank "
+                              "of the input array");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+    for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t acc_nelems(1);
+    for (int i = iter_nd; same_shapes && (i < src_nd); ++i) {
+        auto dst_shape_i = dst_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i);
+        acc_nelems *= static_cast<std::size_t>(dst_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (acc_nelems == 0)) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, acc_nelems * iter_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    std::vector<sycl::event> host_task_events;
+
+    if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) {
+        auto fn = contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error("Datatypes are not supported");
+        }
+
+        sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data,
+                                host_task_events, depends);
+
+        return std::make_pair(
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}),
+            acc_ev);
+    }
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT acc_shape(std::begin(src_shape_vec) + iter_nd,
+                  std::end(src_shape_vec));
+
+    shT acc_src_strides(std::begin(src_strides_vec) + iter_nd,
+                        std::end(src_strides_vec));
+
+    shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd,
+                        std::end(dst_strides_vec));
+
+    shT iter_shape(std::begin(src_shape_vec),
+                   std::begin(src_shape_vec) + iter_nd);
+
+    shT iter_src_strides(std::begin(src_strides_vec),
+                         std::begin(src_strides_vec) + iter_nd);
+
+    shT iter_dst_strides(std::begin(dst_strides_vec),
+                         std::begin(dst_strides_vec) + iter_nd);
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_iter_shape,
+        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
+        acc_src_strides, acc_dst_strides);
+    auto packed_shapes_and_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_and_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *acc_shapes_and_strides =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+
+    sycl::event acc_ev = strided_fn(
+        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
+        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
+        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {acc_ev}, packed_shapes_and_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
+        acc_ev);
+}
+
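+// Note on the include-initial variant: for an input of shape (..., n) the
+// destination has shape (..., n + 1); see the `src_shape_ptr[i] + 1 ==
+// dst_shape_i` check above. Under the assumed semantics, dst[..., 0] holds
+// the identity of the scan op and dst[..., k] the scan over the first k
+// input elements. Illustrative 1-D example for a cumulative sum:
+// src = {1, 2, 3} gives dst = {0, 1, 3, 6}.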
+/*! @brief Template implementing Python API for querying accumulation
+ *  type support */
+template <typename fnT>
+bool py_accumulate_dtype_supported(const py::dtype &input_dtype,
+                                   const py::dtype &output_dtype,
+                                   const fnT &dispatch_table)
+{
+    int arg_tn =
+        input_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int out_tn =
+        output_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int arg_typeid = -1;
+    int out_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
+        out_typeid = array_types.typenum_to_lookup_id(out_tn);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
+        out_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("Reduction type support check: lookup failed");
+    }
+
+    // remove_all_extents gets underlying type of table
+    using fn_ptrT = typename std::remove_all_extents<fnT>::type;
+    fn_ptrT fn = nullptr;
+
+    fn = dispatch_table[arg_typeid][out_typeid];
+
+    return (fn != nullptr);
+}
+
+} // namespace dpctl::tensor::py_internal
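Each cumulative operation later in this diff instantiates py_accumulate_dtype_supported with its own strided dispatch table (the `_cum*_dtype_supported` bindings). A minimal sketch of the intended call shape; `some_op_strided_dispatch_table` is a hypothetical stand-in for e.g. impl::cumsum_strided_dispatch_table defined further below:

    // Hypothetical usage sketch (requires pybind11/numpy.h for py::dtype::of).
    bool supported = dpctl::tensor::py_internal::py_accumulate_dtype_supported(
        py::dtype::of<std::int32_t>(), py::dtype::of<std::int64_t>(),
        some_op_strided_dispatch_table);
    // true iff a kernel instantiation exists for the (int32 -> int64) pair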
diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.cpp
new file mode 100644
index 000000000000..5e07e81b7ad5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.cpp
@@ -0,0 +1,55 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "cumulative_logsumexp.hpp"
+#include "cumulative_prod.hpp"
+#include "cumulative_sum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+/*! @brief Add accumulators to Python module */
+void init_accumulator_functions(py::module_ m)
+{
+    init_cumulative_logsumexp(m);
+    init_cumulative_prod(m);
+    init_cumulative_sum(m);
+}
+
+} // namespace dpctl::tensor::py_internal
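init_accumulator_functions is the single hook the extension module needs to register all three cumulative operations. A sketch of the module entry point that would call it; the module name is an assumption, as the actual PYBIND11_MODULE definition is not part of this hunk:

    // Hypothetical module definition (illustrative only).
    PYBIND11_MODULE(_tensor_accumulation_impl, m)
    {
        dpctl::tensor::py_internal::init_accumulator_functions(m);
    }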
diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.hpp
new file mode 100644
index 000000000000..c33a040a7fa7
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_accumulator_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
new file mode 100644
index 000000000000..d4961c9edbf1
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
@@ -0,0 +1,343 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForLogSumExpAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumLogSumExp1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExp1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr 
(TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumlogsumexp_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumlogsumexp_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumlogsumexp_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumlogsumexp_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumlogsumexp_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumlogsumexp_dispatch_tables; + populate_cumlogsumexp_dispatch_tables(); + + using impl::cumlogsumexp_1d_contig_dispatch_table; + using impl::cumlogsumexp_strided_dispatch_table; + auto cumlogsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst, + exec_q, depends, + cumlogsumexp_strided_dispatch_table, + cumlogsumexp_1d_contig_dispatch_table); + }; + m.def("_cumlogsumexp_over_axis", cumlogsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumlogsumexp_1d_include_initial_contig_dispatch_table; + using 
impl::cumlogsumexp_include_initial_strided_dispatch_table; + auto cumlogsumexp_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumlogsumexp_include_initial_strided_dispatch_table, + cumlogsumexp_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumlogsumexp_final_axis_include_initial", + cumlogsumexp_include_initial_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + return py_accumulate_dtype_supported( + input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table); + }; + m.def("_cumlogsumexp_dtype_supported", cumlogsumexp_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp new file mode 100644 index 000000000000..f1292320bd0d --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_logsumexp(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
new file mode 100644
index 000000000000..319709b30a76
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
@@ -0,0 +1,352 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumprod_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumprod_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForProdAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +using CumProdScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + +template +struct CumProd1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProd1DIncludeInitialContigFactory +{ + fnT get() + { 
+ if constexpr (TypePairSupportDataForProdAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumprod_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumprod_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumprod_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumprod_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumprod_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumprod_dispatch_tables; + populate_cumprod_dispatch_tables(); + + using impl::cumprod_1d_contig_dispatch_table; + using impl::cumprod_strided_dispatch_table; + auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table); + }; + m.def("_cumprod_over_axis", cumprod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumprod_1d_include_initial_contig_dispatch_table; + using impl::cumprod_include_initial_strided_dispatch_table; + auto cumprod_include_initial_pyapi = + 
[&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumprod_include_initial_strided_dispatch_table, + cumprod_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumprod_final_axis_include_initial", cumprod_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumprod_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumprod_strided_dispatch_table); + }; + m.def("_cumprod_dtype_supported", cumprod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.hpp new file mode 100644 index 000000000000..e14bb2c44361 --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_prod(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
new file mode 100644
index 000000000000..f700883af2a1
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
@@ -0,0 +1,350 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumsum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumsum_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForSumAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +using CumSumScanOpT = std:: + conditional_t, sycl::logical_or, sycl::plus>; + +template +struct CumSum1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSum1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr 
(TypePairSupportDataForSumAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumsum_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumsum_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumsum_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumsum_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(cumsum_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumsum_dispatch_tables; + populate_cumsum_dispatch_tables(); + + using impl::cumsum_1d_contig_dispatch_table; + using impl::cumsum_strided_dispatch_table; + auto cumsum_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table); + }; + m.def("_cumsum_over_axis", cumsum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumsum_1d_include_initial_contig_dispatch_table; + using impl::cumsum_include_initial_strided_dispatch_table; + auto cumsum_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, 
sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumsum_include_initial_strided_dispatch_table, + cumsum_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumsum_final_axis_include_initial", cumsum_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumsum_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumsum_strided_dispatch_table); + }; + m.def("_cumsum_dtype_supported", cumsum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.hpp new file mode 100644 index 000000000000..5e06b222a3bc --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_sum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
new file mode 100644
index 000000000000..146be45e4858
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -0,0 +1,853 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.place and +/// dpctl.tensor.extract, dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" + +namespace dpctl::tensor::py_internal +{ + +// Masked extraction + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_strided_impl_fn_ptr_t; + +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_i32_impl_dispatch_vector + [td_ns::num_types]; +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_i64_impl_dispatch_vector + [td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_contig_impl_fn_ptr_t; + +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i64_impl_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_some_slices_strided_impl_fn_ptr_t; + +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_i32_impl_dispatch_vector + [td_ns::num_types]; +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_i64_impl_dispatch_vector + [td_ns::num_types]; + +void populate_masked_extract_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactoryForInt32, td_ns::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_extract_all_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactoryForInt64, td_ns::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_extract_all_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractSomeSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactoryForInt32, td_ns::num_types> + dvb3; + dvb3.populate_dispatch_vector( + masked_extract_some_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractSomeSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactoryForInt64, td_ns::num_types> + dvb4; + dvb4.populate_dispatch_vector( + 
        masked_extract_some_slices_strided_i64_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractAllSlicesContigFactoryForInt32;
+    td_ns::DispatchVectorBuilder<
+        masked_extract_all_slices_contig_impl_fn_ptr_t,
+        MaskExtractAllSlicesContigFactoryForInt32, td_ns::num_types>
+        dvb5;
+    dvb5.populate_dispatch_vector(
+        masked_extract_all_slices_contig_i32_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractAllSlicesContigFactoryForInt64;
+    td_ns::DispatchVectorBuilder<
+        masked_extract_all_slices_contig_impl_fn_ptr_t,
+        MaskExtractAllSlicesContigFactoryForInt64, td_ns::num_types>
+        dvb6;
+    dvb6.populate_dispatch_vector(
+        masked_extract_all_slices_contig_i64_impl_dispatch_vector);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_extract(const dpctl::tensor::usm_ndarray &src,
+               const dpctl::tensor::usm_ndarray &cumsum,
+               int axis_start, // axis_start <= mask_i < axis_end
+               int axis_end,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int src_nd = src.get_ndim();
+    if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) {
+        throw py::value_error("Specified axes_start and axes_end are invalid.");
+    }
+    int mask_span_sz = axis_end - axis_start;
+
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd + (mask_span_sz - 1)) {
+        throw py::value_error("Number of dimensions of source and destination "
+                              "arrays is not consistent");
+    }
+
+    if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) {
+        throw py::value_error("cumsum array must be a C-contiguous vector");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    py::ssize_t cumsum_sz = cumsum.get_size();
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool same_ortho_dims(true);
+    std::size_t ortho_nelems(1); // number of orthogonal iterations
+
+    for (auto i = 0; i < axis_start; ++i) {
+        auto src_sh_i = src_shape[i];
+        ortho_nelems *= src_sh_i;
+        same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis_end; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        ortho_nelems *= src_sh_i;
+        same_ortho_dims =
+            same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]);
+    }
+
+    std::size_t masked_src_nelems(1);
+    std::size_t masked_dst_nelems(dst_shape[axis_start]);
+    for (auto i = axis_start; i < axis_end; ++i) {
+        masked_src_nelems *= src_shape[i];
+    }
+
+    // masked_dst_nelems is number of set elements in the mask, or last element
+    // in cumsum
+    if (!same_ortho_dims ||
+        (masked_src_nelems != static_cast<std::size_t>(cumsum_sz))) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, ortho_nelems * masked_dst_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src, nor with cumsum.
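+    // Illustration of the contract validated above (hypothetical values, not
+    // taken from any caller): for src of shape (2, 3) masked over both axes
+    // (axis_start=0, axis_end=2) with mask [[1, 0, 1], [0, 0, 1]], cumsum is
+    // the C-order inclusive scan [1, 1, 2, 2, 2, 3]. Then masked_src_nelems
+    // == 6 == cumsum_sz, and dst has shape (3,): the last cumsum entry is
+    // the number of selected elements per orthogonal slice.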
+ if (overlap(dst, cumsum) || overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error("Unexpected data type of cumsum array, expecting " + "'int32' or 'int64'"); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == src_nd) { + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src.is_c_contiguous()) { + auto fn = + (use_i32) + ? masked_extract_all_slices_contig_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_contig_i64_impl_dispatch_vector + [src_typeid]; + + extract_ev = + fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, dst_data_p, + dst_shape_vec[0], dst_strides_vec[0], depends); + + // + host_task_events.push_back(extract_ev); + } + else { + // empty orthogonal directions + auto fn = + (use_i32) + ? masked_extract_all_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + } + else { + // non-empty orthogonal directions + auto fn = + (use_i32) + ? 
masked_extract_some_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_some_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + int masked_src_nd = mask_span_sz; + int ortho_nd = src_nd - masked_src_nd; + + using shT = std::vector; + + shT ortho_src_shape; + shT masked_src_shape; + shT ortho_src_strides; + shT masked_src_strides; + split_iteration_space(src_shape_vec, src_strides_vec, axis_start, + axis_end, ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_start + 1, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), + ortho_dst_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_src_strides; + std::vector simplified_ortho_dst_strides; + + const py::ssize_t *_shape = ortho_src_shape.data(); + + py::ssize_t ortho_src_offset(0); + py::ssize_t ortho_dst_offset(0); + + simplify_iteration_space( + ortho_nd, _shape, ortho_src_strides, ortho_dst_strides, + // output + simplified_ortho_shape, simplified_ortho_src_strides, + simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset); + + assert(masked_dst_shape.size() == 1); + assert(masked_dst_strides.size() == 1); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides, + masked_src_shape, masked_src_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *packed_ortho_src_dst_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_src_shape_strides = + packed_shapes_strides + (3 * ortho_nd); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT + // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_ + extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p, + cumsum_data_p, dst_data_p, + // data to build orthog_src_dst_indexer + ortho_nd, packed_ortho_src_dst_shape_strides, + ortho_src_offset, ortho_dst_offset, + // data to build masked_src_indexer + masked_src_nd, packed_masked_src_shape_strides, + // data to build masked_dst_indexer, + masked_dst_shape[0], masked_dst_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Masked placement + +using 
dpctl::tensor::kernels::indexing:: + masked_place_all_slices_strided_impl_fn_ptr_t; + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_i64_impl_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_place_some_slices_strided_impl_fn_ptr_t; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_i64_impl_dispatch_vector[td_ns::num_types]; + +void populate_masked_place_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing:: + MaskPlaceAllSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector( + masked_place_all_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceAllSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector( + masked_place_all_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceSomeSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector( + masked_place_some_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceSomeSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector( + masked_place_some_slices_strided_i64_impl_dispatch_vector); +} + +/* + * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id] if cumsum[i] == + * ((i > 0) ? cumsum[i-1] + 1 : 1) + */ +std::pair + py_place(const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int dst_nd = dst.get_ndim(); + if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int rhs_nd = rhs.get_ndim(); + if (dst_nd != rhs_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + bool same_ortho_dims(true); + std::size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]); + } + for (auto i = axis_end; i < dst_nd; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = + same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); + } + + std::size_t masked_dst_nelems(1); + for (auto i = axis_start; i < 
axis_end; ++i) {
+        masked_dst_nelems *= dst_shape[i];
+    }
+
+    if (!same_ortho_dims ||
+        (masked_dst_nelems != static_cast<std::size_t>(cumsum_sz))) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, ortho_nelems * masked_dst_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with rhs, nor with cumsum.
+    if (overlap(dst, rhs) || overlap(dst, cumsum)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int dst_typenum = dst.get_typenum();
+    int rhs_typenum = rhs.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    static constexpr int int32_typeid =
+        static_cast<int>(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
+        throw py::value_error("Unexpected data type of cumsum array, expecting "
+                              "'int32' or 'int64'");
+    }
+
+    const bool use_i32 = (cumsum_typeid == int32_typeid);
+
+    if (dst_typeid != rhs_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data types");
+    }
+
+    char *dst_data_p = dst.get_data();
+    char *rhs_data_p = rhs.get_data();
+    char *cumsum_data_p = cumsum.get_data();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    auto rhs_shape_vec = rhs.get_shape_vector();
+    auto rhs_strides_vec = rhs.get_strides_vector();
+
+    sycl::event place_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis_start == 0 && axis_end == dst_nd) {
+        // empty orthogonal directions
+        auto fn = (use_i32)
+                      ? masked_place_all_slices_strided_i32_impl_dispatch_vector
+                            [dst_typeid]
+                      : masked_place_all_slices_strided_i64_impl_dispatch_vector
+                            [dst_typeid];
+
+        assert(rhs_shape_vec.size() == 1);
+        assert(rhs_strides_vec.size() == 1);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, dst_shape_vec, dst_strides_vec);
+        auto packed_dst_shape_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_dst_shape_strides_ev =
+            std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_dst_shape_strides =
+            packed_dst_shape_strides_owner.get();
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_dst_shape_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        place_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, rhs_data_p,
+                      dst_nd, packed_dst_shape_strides, rhs_shape_vec[0],
+                      rhs_strides_vec[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {place_ev}, packed_dst_shape_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+    else {
+        // non-empty orthogonal directions
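+        // For orientation (illustrative shapes, not from a caller):
+        // split_iteration_space below separates the masked axes from the
+        // orthogonal ones, e.g. a dst of shape (4, 5, 6) with axis_start=1,
+        // axis_end=3 splits into an orthogonal part (4,) iterated
+        // ortho_nelems times and a masked part (5, 6) addressed via cumsum.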
+        auto fn =
+            (use_i32)
+                ? masked_place_some_slices_strided_i32_impl_dispatch_vector
+                      [dst_typeid]
+                : masked_place_some_slices_strided_i64_impl_dispatch_vector
+                      [dst_typeid];
+
+        int masked_dst_nd = mask_span_sz;
+        int ortho_nd = dst_nd - masked_dst_nd;
+
+        using shT = std::vector<py::ssize_t>;
+
+        shT ortho_dst_shape;
+        shT masked_dst_shape;
+        shT ortho_dst_strides;
+        shT masked_dst_strides;
+        split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start,
+                              axis_end, ortho_dst_shape,
+                              masked_dst_shape, // 4 vectors modified
+                              ortho_dst_strides, masked_dst_strides);
+
+        shT ortho_rhs_shape;
+        shT masked_rhs_shape;
+        shT ortho_rhs_strides;
+        shT masked_rhs_strides;
+        split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start,
+                              axis_start + 1, ortho_rhs_shape,
+                              masked_rhs_shape, // 4 vectors modified
+                              ortho_rhs_strides, masked_rhs_strides);
+
+        assert(ortho_dst_shape.size() == static_cast<std::size_t>(ortho_nd));
+        assert(ortho_rhs_shape.size() == static_cast<std::size_t>(ortho_nd));
+        assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(),
+                          ortho_rhs_shape.begin()));
+
+        std::vector<py::ssize_t> simplified_ortho_shape;
+        std::vector<py::ssize_t> simplified_ortho_dst_strides;
+        std::vector<py::ssize_t> simplified_ortho_rhs_strides;
+
+        const py::ssize_t *_shape = ortho_dst_shape.data();
+
+        py::ssize_t ortho_dst_offset(0);
+        py::ssize_t ortho_rhs_offset(0);
+
+        simplify_iteration_space(
+            ortho_nd, _shape, ortho_dst_strides, ortho_rhs_strides,
+            simplified_ortho_shape, simplified_ortho_dst_strides,
+            simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset);
+
+        assert(masked_rhs_shape.size() == 1);
+        assert(masked_rhs_strides.size() == 1);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_ortho_shape,
+            simplified_ortho_dst_strides, simplified_ortho_rhs_strides,
+            masked_dst_shape, masked_dst_strides);
+        auto packed_shapes_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_shapes_strides =
+            packed_shapes_strides_owner.get();
+
+        const py::ssize_t *packed_ortho_dst_rhs_shape_strides =
+            packed_shapes_strides;
+        const py::ssize_t *packed_masked_dst_shape_strides =
+            packed_shapes_strides + (3 * ortho_nd);
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        place_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p,
+                      cumsum_data_p, rhs_data_p,
+                      // data to build orthog_dst_rhs_indexer
+                      ortho_nd, packed_ortho_dst_rhs_shape_strides,
+                      ortho_dst_offset, ortho_rhs_offset,
+                      // data to build masked_dst_indexer
+                      masked_dst_nd, packed_masked_dst_shape_strides,
+                      // data to build masked_rhs_indexer
+                      masked_rhs_shape[0], masked_rhs_strides[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {place_ev}, packed_shapes_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {dst, cumsum, rhs}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, place_ev);
+}
+
+// Non-zero
+
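+// Sketch of the contract (hypothetical 2x2 mask [[0, 1], [1, 0]]): cumsum of
+// the flattened mask is [0, 1, 2, 2]; the positions where it increments are
+// unraveled over mask_shape, so indexes becomes [[0, 1], [1, 0]] and
+// indexes[:, k] holds the coordinates of the k-th nonzero element.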
+std::pair<sycl::event, sycl::event>
+    py_nonzero(const dpctl::tensor::usm_ndarray
+                   &cumsum, // int32/int64 input array, 1D, C-contiguous
+               const dpctl::tensor::usm_ndarray
+                   &indexes, // int32/int64 2D output array, C-contiguous
+               const std::vector<py::ssize_t>
+                   &mask_shape, // shape of array from which cumsum was computed
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends)
+{
+    if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(indexes);
+
+    int cumsum_nd = cumsum.get_ndim();
+    if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) {
+        throw py::value_error("Cumsum array must be a C-contiguous vector");
+    }
+
+    int indexes_nd = indexes.get_ndim();
+    if (indexes_nd != 2 || !indexes.is_c_contiguous()) {
+        throw py::value_error("Index array must be a C-contiguous matrix");
+    }
+
+    std::size_t _ndim = mask_shape.size();
+    if (_ndim > std::numeric_limits<int>::max()) {
+        throw py::value_error("Shape is too large");
+    }
+    int ndim = static_cast<int>(_ndim);
+
+    const py::ssize_t *indexes_shape = indexes.get_shape_raw();
+
+    if (ndim != indexes_shape[0]) {
+        throw py::value_error(
+            "Length of shape must equal width of index matrix");
+    }
+
+    auto cumsum_sz = cumsum.get_size();
+    py::ssize_t shape_nelems =
+        std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1),
+                        std::multiplies<py::ssize_t>());
+
+    if (cumsum_sz != shape_nelems) {
+        throw py::value_error("Shape and cumsum size are not consistent");
+    }
+
+    py::ssize_t nz_elems = indexes_shape[1];
+
+    int indexes_typenum = indexes.get_typenum();
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum);
+
+    int cumsum_typenum = cumsum.get_typenum();
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    constexpr int int32_typeid = static_cast<int>(td_ns::typenum_t::INT32);
+    constexpr int int64_typeid = static_cast<int>(td_ns::typenum_t::INT64);
+
+    // cumsum must be int32_t or int64_t only
+    if ((cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) ||
+        (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid)) {
+        throw py::value_error("Cumulative sum array and index array must have "
+                              "int32 or int64 data-type");
+    }
+
+    if (cumsum_sz == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(cumsum, indexes)) {
+        throw py::value_error("Arrays are expected to have no memory overlap");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        indexes, nz_elems * _ndim);
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto mask_shape_copying_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, mask_shape);
+    auto src_shape_device_owner =
+        std::move(std::get<0>(mask_shape_copying_tuple));
+    sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple);
+    const py::ssize_t *src_shape_device_ptr = src_shape_device_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_ev);
+
+    using dpctl::tensor::kernels::indexing::non_zero_indexes_fn_ptr_t;
+    using dpctl::tensor::kernels::indexing::non_zero_indexes_impl;
+
+    int fn_index = ((cumsum_typeid == int64_typeid) ? 1 : 0) +
+                   ((indexes_typeid == int64_typeid) ? 2 : 0);
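+    // fn_index is a two-bit code over the (cumsum, indexes) type pair: bit 0
+    // is set for an int64 cumsum, bit 1 for int64 indexes, selecting one of
+    // the four kernel instantiations below.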
+    std::array<non_zero_indexes_fn_ptr_t, 4> fn_impls = {
+        non_zero_indexes_impl<std::int32_t, std::int32_t>,
+        non_zero_indexes_impl<std::int64_t, std::int32_t>,
+        non_zero_indexes_impl<std::int32_t, std::int64_t>,
+        non_zero_indexes_impl<std::int64_t, std::int64_t>};
+    auto fn = fn_impls[fn_index];
+
+    sycl::event non_zero_indexes_ev =
+        fn(exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(),
+           indexes.get_data(), src_shape_device_ptr, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {non_zero_indexes_ev}, src_shape_device_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {cumsum, indexes}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, non_zero_indexes_ev);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.hpp
new file mode 100644
index 000000000000..71eafc77b00c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.hpp
@@ -0,0 +1,81 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    py_extract(const dpctl::tensor::usm_ndarray &src,
+               const dpctl::tensor::usm_ndarray &cumsum,
+               int axis_start, // axis_start <= mask_i < axis_end
+               int axis_end,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends = {});
+
+extern void populate_masked_extract_dispatch_vectors(void);
+
+extern std::pair<sycl::event, sycl::event>
+    py_place(const dpctl::tensor::usm_ndarray &dst,
+             const dpctl::tensor::usm_ndarray &cumsum,
+             int axis_start, // axis_start <= mask_i < axis_end
+             int axis_end,
+             const dpctl::tensor::usm_ndarray &rhs,
+             sycl::queue &exec_q,
+             const std::vector<sycl::event> &depends = {});
+
+extern void populate_masked_place_dispatch_vectors(void);
+
+extern std::pair<sycl::event, sycl::event>
+    py_nonzero(const dpctl::tensor::usm_ndarray
+                   &cumsum, // int32/int64 input array, 1D, C-contiguous
+               const dpctl::tensor::usm_ndarray
+                   &indexes, // int32/int64 2D output array, C-contiguous
+               const std::vector<py::ssize_t>
+                   &mask_shape, // shape of array from which cumsum was computed
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends = {});
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/clip.cpp b/dpnp/tensor/libtensor/source/clip.cpp
new file mode 100644
index 000000000000..4a0e5b9357de
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/clip.cpp
@@ -0,0 +1,263 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines Python API for implementation functions of
+/// dpctl.tensor.clip
+//===---------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "dpnp4pybind11.hpp"
+#include
+
+#include "clip.hpp"
+#include "kernels/clip.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t;
+using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t;
+
+static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types];
+static clip_strided_impl_fn_ptr_t
+    clip_strided_dispatch_vector[td_ns::num_types];
+
+void init_clip_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::clip::ClipContigFactory;
+    DispatchVectorBuilder<clip_contig_impl_fn_ptr_t, ClipContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(clip_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::clip::ClipStridedFactory;
+    DispatchVectorBuilder<clip_strided_impl_fn_ptr_t, ClipStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(clip_strided_dispatch_vector);
+}
+
+using dpctl::utils::keep_args_alive;
+
+std::pair<sycl::event, sycl::event>
+    py_clip(const dpctl::tensor::usm_ndarray &src,
+            const dpctl::tensor::usm_ndarray &min,
+            const dpctl::tensor::usm_ndarray &max,
+            const dpctl::tensor::usm_ndarray &dst,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends)
+{
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int nd = src.get_ndim();
+    int min_nd = min.get_ndim();
+    int max_nd = max.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (nd != min_nd || nd != max_nd) {
+        throw py::value_error(
+            "Input arrays are not of appropriate dimension for clip kernel.");
+    }
+
+    if (nd != dst_nd) {
+        throw py::value_error(
+            "Destination is not of appropriate dimension for clip kernel.");
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *min_shape = min.get_shape_raw();
+    const py::ssize_t *max_shape = max.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    std::size_t nelems(1);
+    for (int i = 0; i < nd; ++i) {
+        const auto &sh_i = dst_shape[i];
+        nelems *= static_cast<std::size_t>(sh_i);
+        shapes_equal = shapes_equal && (min_shape[i] == sh_i) &&
+                       (max_shape[i] == sh_i) && (src_shape[i] == sh_i);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Arrays are not of matching shapes.");
+    }
+
+    if (nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
+        (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
+        (overlap(dst, max) && !same_logical_tensors(dst, max))) {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
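+    // Semantics note (informal): the kernels dispatched below compute, per
+    // element, dst[i] = src[i] clamped to the interval [min[i], max[i]],
+    // roughly std::min(std::max(src[i], min[i]), max[i]), up to the kernel's
+    // own handling of NaNs.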
+    int min_typenum = min.get_typenum();
+    int max_typenum = max.get_typenum();
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int min_typeid = array_types.typenum_to_lookup_id(min_typenum);
+    int max_typeid = array_types.typenum_to_lookup_id(max_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid || src_typeid != min_typeid ||
+        src_typeid != max_typeid) {
+        throw py::value_error("Input, min, max, and destination arrays must "
+                              "have the same data type");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems);
+
+    char *src_data = src.get_data();
+    char *min_data = min.get_data();
+    char *max_data = max.get_data();
+    char *dst_data = dst.get_data();
+
+    bool is_min_c_contig = min.is_c_contiguous();
+    bool is_min_f_contig = min.is_f_contiguous();
+
+    bool is_max_c_contig = max.is_c_contiguous();
+    bool is_max_f_contig = max.is_f_contiguous();
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool all_c_contig = (is_min_c_contig && is_max_c_contig &&
+                         is_src_c_contig && is_dst_c_contig);
+    bool all_f_contig = (is_min_f_contig && is_max_f_contig &&
+                         is_src_f_contig && is_dst_f_contig);
+
+    if (all_c_contig || all_f_contig) {
+        auto fn = clip_contig_dispatch_vector[src_typeid];
+
+        sycl::event clip_ev =
+            fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends);
+        sycl::event ht_ev =
+            keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev});
+
+        return std::make_pair(ht_ev, clip_ev);
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &min_strides = min.get_strides_vector();
+    auto const &max_strides = max.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_min_strides;
+    shT simplified_max_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t min_offset(0);
+    py::ssize_t max_offset(0);
+    py::ssize_t dst_offset(0);
+
+    simplify_iteration_space_4(
+        nd, src_shape, src_strides, min_strides, max_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides, src_offset, min_offset,
+        max_offset, dst_offset);
+
+    auto fn = clip_strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // common shape and strides
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data,
+                             dst_data, packed_shape_strides,
src_offset, + min_offset, max_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {clip_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, min, max, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, clip_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/clip.hpp b/dpnp/tensor/libtensor/source/clip.hpp new file mode 100644 index 000000000000..de8f0e559b6e --- /dev/null +++ b/dpnp/tensor/libtensor/source/clip.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.clip +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +extern void init_clip_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp new file mode 100644 index 000000000000..7c2db989b0c2 --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -0,0 +1,296 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_contig_fn_ptr_t + copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && (i < src_nd); ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. 
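+    // Overview: from here the copy lowers to one of several paths: a single
+    // memcpy when src and dst share both data type and contiguity, the
+    // py_as_c_contig/py_as_f_contig specializations for same-type copies
+    // into a contiguous destination, 1d/0d special cases, or the generic
+    // strided copy-and-cast kernel driven by packed shape/strides metadata.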
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + + sycl::event copy_ev; + if (src_type_id == dst_type_id) { + + int src_elem_size = src.get_elemsize(); + + copy_ev = exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + } + else { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id]; + copy_ev = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (nd < 2) { + if (nd == 1) { + std::array shape_arr = {simplified_shape[0]}; + std::array src_strides_arr = { + simplified_src_strides[0]}; + std::array dst_strides_arr = { + simplified_dst_strides[0]}; + + sycl::event copy_and_cast_1d_event; + if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) && + (src_offset == 0) && (dst_offset == 0)) { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id] + [src_type_id]; + copy_and_cast_1d_event = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + else { + auto fn = + copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + copy_and_cast_1d_event = + fn(exec_q, src_nelems, shape_arr, src_strides_arr, + dst_strides_arr, src_data, src_offset, dst_data, + dst_offset, depends); + } + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto copy_and_cast_fn = + 
        copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    const sycl::event &copy_and_cast_generic_ev = copy_and_cast_fn(
+        exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
+        dst_offset, depends, {copy_shape_ev});
+
+    // async free of shape_strides temporary
+    const auto &temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_and_cast_generic_ev}, shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_and_cast_generic_ev);
+}
+
+void init_copy_and_cast_usm_to_usm_dispatch_tables(void)
+{
+    using namespace td_ns;
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory;
+    DispatchTableBuilder<copy_and_cast_contig_fn_ptr_t,
+                         CopyAndCastContigFactory, num_types>
+        dtb_contig;
+    dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory;
+    DispatchTableBuilder<copy_and_cast_generic_fn_ptr_t,
+                         CopyAndCastGenericFactory, num_types>
+        dtb_generic;
+    dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory;
+    DispatchTableBuilder<copy_and_cast_1d_fn_ptr_t, CopyAndCast1DFactory,
+                         num_types>
+        dtb_1d;
+    dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
new file mode 100644
index 000000000000..d2e07b08d38f
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
@@ -0,0 +1,53 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_as_contig.cpp b/dpnp/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 000000000000..c1c4b740dfba --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,782 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template +std::size_t get_nelems(const std::vector &shape) +{ + auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + static constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_c_contig(const 
dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. + */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, src_shape_vec.data(), src_strides_vec, + dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event 
ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. 
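+    // Illustrative note (assumption, not part of the kernel contract as
+    // stated in this patch): src and dst may be views into one USM
+    // allocation, so a hypothetical Python-level call such as
+    //
+    //     x = dpnp.ones((4, 4))
+    //     # src = x[:, :-1] and dst = x[:, 1:] alias the same buffer, so
+    //     # work-items could read elements another work-item just wrote.
+    //
+    // would race; MemoryOverlap conservatively flags such pairs before the
+    // copy is submitted.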
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, src_shape_vec.data(), src_strides_vec, + dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + 
sycl::queue &exec_q,
+                       const std::vector<sycl::event> &depends)
+{
+    /* Same dimensions, same shape, same data-type
+     * dst is F-contiguous.
+     */
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Number of dimensions must be the same.");
+    }
+    if (src_nd < 2) {
+        throw py::value_error("Arrays must have 2 or more axes");
+    }
+
+    // ensures also that destination is ample enough to accommodate all
+    // elements of src array
+    if (!dst.is_f_contiguous()) {
+        throw py::value_error("Destination array must be F-contiguous");
+    }
+
+    const auto &src_shape_vec = src.get_shape_vector();
+    const auto &dst_shape_vec = dst.get_shape_vector();
+
+    std::size_t nelems{1};
+    bool equal_shapes = true;
+
+    for (int i = 0; equal_shapes && (i < src_nd); ++i) {
+        auto sh_i = src_shape_vec[i];
+        equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]);
+        nelems *= static_cast<std::size_t>(sh_i);
+    }
+
+    if (!equal_shapes) {
+        throw py::value_error("Shapes must be equal");
+    }
+
+    const auto n = dst_shape_vec.front();
+    if (dst_shape_vec[1] != n) {
+        throw py::value_error("Matrices must be square");
+    }
+
+    const auto &src_strides_vec = src.get_strides_vector();
+
+    if (src_strides_vec[1] != py::ssize_t(1)) {
+        throw py::value_error("Unexpected source array layout");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_type_id != dst_type_id) {
+        throw py::value_error(
+            "Source and destination arrays must have the same data type");
+    }
+
+    if (nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    const auto &dst_strides_vec = dst.get_strides_vector();
+
+    const std::size_t batch_nelems =
+        (src_nd == 2) ? std::size_t(1) : (nelems / (n * n));
+    const py::ssize_t dst_batch_step =
+        (src_nd == 2) ?
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +} // namespace 
dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_as_contig.hpp b/dpnp/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 000000000000..bfe3159c8813 --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,54 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl::tensor::py_internal +{ + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_for_reshape.cpp b/dpnp/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 000000000000..524bfcfdb98b --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_reshape.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ *
+ * Equivalent to the following loop:
+ *
+ *     for i in range(src.size):
+ *         dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)]
+ */
+std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    py::ssize_t src_nelems = src.get_size();
+    py::ssize_t dst_nelems = dst.get_size();
+
+    // Must have the same number of elements
+    if (src_nelems != dst_nelems) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_reshape requires src and dst to "
+            "have the same number of elements.");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    // type numbers must be the same
+    if (src_typenum != dst_typenum) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_reshape requires src and dst to "
+            "have the same type.");
+    }
+
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check same contexts
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if (src_nelems == 1) {
+        // handle special case of 1-element array
+        int src_elemsize = src.get_elemsize();
+        const char *src_data = src.get_data();
+        char *dst_data = dst.get_data();
+        sycl::event copy_ev =
+            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
+    }
+
+    // dimensions may be different
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int type_id = array_types.typenum_to_lookup_id(src_typenum);
+
+    auto fn = copy_for_reshape_generic_dispatch_vector[type_id];
+
+    auto src_shape = src.get_shape_vector();
+    auto src_strides = src.get_strides_vector();
+
+    auto dst_shape = dst.get_shape_vector();
+    auto dst_strides = dst.get_strides_vector();
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_shape, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, src_shape, src_strides, dst_shape,
+        dst_strides);
+    auto copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_reshape_event =
+        fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data,
+           dst_data, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_for_reshape_event}, shape_strides_owner);
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_reshape_event);
+}
+
+void init_copy_for_reshape_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory;
+
+    DispatchVectorBuilder<copy_for_reshape_fn_ptr_t,
+                          CopyForReshapeGenericFactory, num_types>
+        dvb;
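+    // Sketch of how the populated vector is consumed (names taken from the
+    // copy entry point earlier in this file, shown here for orientation):
+    //
+    //     auto fn = copy_for_reshape_generic_dispatch_vector[type_id];
+    //     sycl::event ev = fn(exec_q, src_nelems, src_nd, dst_nd,
+    //                         shape_strides, src_data, dst_data, all_deps);
+    //
+    // The builder instantiates CopyForReshapeGenericFactory once per
+    // supported type id, so runtime dispatch is a single indexed load.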
dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_for_reshape.hpp b/dpnp/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 000000000000..c5af885ad6cd --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,54 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_for_roll.cpp b/dpnp/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 000000000000..7742c1c96a4e --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,399 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_roll.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + std::size_t offset = + (shift > 0) ? 
(shift % src_nelems) : (src_nelems + (shift % src_nelems)) % src_nelems;
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    if (both_c_contig || both_f_contig) {
+        auto fn = copy_for_roll_contig_dispatch_vector[type_id];
+
+        if (fn != nullptr) {
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event copy_for_roll_ev =
+                fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data,
+                   zero_offset, depends);
+
+            sycl::event ht_ev =
+                keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev});
+
+            return std::make_pair(ht_ev, copy_for_roll_ev);
+        }
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd;
+    const py::ssize_t *shape = src_shape_ptr;
+
+    // nd, simplified_* and *_offset are modified by reference
+    simplify_iteration_space(nd, shape, src_strides, dst_strides,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (nd == 1 && simplified_src_strides[0] == 1 &&
+        simplified_dst_strides[0] == 1)
+    {
+        auto fn = copy_for_roll_contig_dispatch_vector[type_id];
+
+        if (fn != nullptr) {
+
+            sycl::event copy_for_roll_ev =
+                fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data,
+                   dst_offset, depends);
+
+            sycl::event ht_ev =
+                keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev});
+
+            return std::make_pair(ht_ev, copy_for_roll_ev);
+        }
+    }
+
+    auto fn = copy_for_roll_strided_dispatch_vector[type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_roll_event =
+        fn(exec_q, offset, src_nelems, nd, shape_strides, src_data,
+           src_offset, dst_data, dst_offset, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_for_roll_event}, shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_roll_event);
+}
+
+std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 const std::vector<py::ssize_t> &shifts,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    // Must have the same number of dimensions
+    if (src_nd != dst_nd) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same number of dimensions.");
+    }
+
+    if (static_cast<std::size_t>(src_nd) != shifts.size()) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires shifts to "
+            "contain an integral shift for each array dimension.");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const
py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check for compatible queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + std::vector normalized_shifts{}; + normalized_shifts.reserve(src_nd); + + for (int i = 0; i < src_nd; ++i) { + // normalize shift parameter to be 0 <= offset < dim + py::ssize_t dim = src_shape_ptr[i]; + std::size_t offset = + (shifts[i] >= 0) ? (shifts[i] % dim) : dim + (shifts[i] % dim); + + normalized_shifts.push_back(offset); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + auto const &common_shape = src.get_shape_vector(); + + static constexpr py::ssize_t src_offset = 0; + static constexpr py::ssize_t dst_offset = 0; + + auto fn = copy_for_roll_ndshift_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, common_shape, src_strides, dst_strides, + normalized_shifts); + auto shape_strides_shifts_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_shifts_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +void init_copy_for_roll_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory; + + DispatchVectorBuilder + dvb1; + 
dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory; + DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_for_roll.hpp b/dpnp/tensor/libtensor/source/copy_for_roll.hpp new file mode 100644 index 000000000000..cffbf9f6f0d6 --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_for_roll.hpp @@ -0,0 +1,65 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 000000000000..e97e8aeb1ca1 --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,368 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "kernels/copy_and_cast.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
+#include "simplify_iteration_space.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::tensor::kernels::copy_and_cast::
+    copy_and_cast_from_host_blocking_fn_ptr_t;
+
+static copy_and_cast_from_host_blocking_fn_ptr_t
+    copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types]
+                                                   [td_ns::num_types];
+
+using dpctl::tensor::kernels::copy_and_cast::
+    copy_and_cast_from_host_contig_blocking_fn_ptr_t;
+
+static copy_and_cast_from_host_contig_blocking_fn_ptr_t
+    copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types]
+                                                          [td_ns::num_types];
+
+void copy_numpy_ndarray_into_usm_ndarray(
+    const py::array &npy_src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends)
+{
+    int src_ndim = npy_src.ndim();
+    int dst_ndim = dst.get_ndim();
+
+    if (src_ndim != dst_ndim) {
+        throw py::value_error("Source ndarray and destination usm_ndarray have "
+                              "different array ranks, "
+                              "i.e. different number of indices needed to "
+                              "address array elements.");
+    }
+
+    const py::ssize_t *src_shape = npy_src.shape();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+    for (int i = 0; shapes_equal && (i < src_ndim); ++i) {
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Source ndarray and destination usm_ndarray have "
+                              "different shapes.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return;
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error("Execution queue is not compatible with the "
+                              "allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // here we assume that NumPy's type numbers agree with ours for types
+    // supported in both
+    int src_typenum =
+        py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num;
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    py::buffer_info src_pybuf = npy_src.request();
+    const char *const src_data = static_cast<const char *>(src_pybuf.ptr);
+    char *dst_data = dst.get_data();
+
+    int src_flags = npy_src.flags();
+
+    // check for applicability of special cases:
+    // (same type && (both C-contiguous || both F-contiguous))
+    const bool both_c_contig =
+        ((src_flags & py::array::c_style) && dst.is_c_contiguous());
+    const bool both_f_contig =
+        ((src_flags & py::array::f_style) &&
dst.is_f_contiguous()); + + const bool same_data_types = (src_type_id == dst_type_id); + + if (both_c_contig || both_f_contig) { + if (same_data_types) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + copy_ev.wait(); + } + + return; + } + else { + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table + [dst_type_id][src_type_id]; + + static constexpr py::ssize_t zero_offset(0); + + copy_and_cast_from_host_contig_blocking_fn( + exec_q, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + return; + } + } + + auto const &dst_strides = + dst.get_strides_vector(); // N.B.: strides in elements + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_ndim; + const py::ssize_t *shape = src_shape; + + const py::ssize_t *src_strides_p = + npy_src.strides(); // N.B.: strides in bytes + py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes + + bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); + bool is_src_f_contig = ((src_flags & py::array::f_style) != 0); + + shT src_strides_in_elems; + if (src_strides_p) { + src_strides_in_elems.resize(nd); + // copy and convert strides from bytes to elements + std::transform( + src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems), + [src_itemsize](py::ssize_t el) { + py::ssize_t q = el / src_itemsize; + if (q * src_itemsize != el) { + throw std::runtime_error( + "NumPy array strides are not multiple of itemsize"); + } + return q; + }); + } + else { + if (is_src_c_contig) { + src_strides_in_elems = + dpctl::tensor::c_contiguous_strides(nd, src_shape); + } + else if (is_src_f_contig) { + src_strides_in_elems = + dpctl::tensor::f_contiguous_strides(nd, src_shape); + } + else { + throw py::value_error("NumPy source array has null strides but is " + "neither C- nor F-contiguous."); + } + } + + // nd, simplified_* vectors and offsets are modified by reference + simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides, + // outputs + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(1); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(1); + } + + const bool is_contig_vector = + ((nd == 1) && (simplified_src_strides.front() == 1) && + (simplified_dst_strides.front() == 1)); + + const bool can_use_memcpy = (same_data_types && is_contig_vector && + (src_offset == 0) && (dst_offset == 0)); + + if (can_use_memcpy) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = exec_q.memcpy( + static_cast(dst_data), static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a 
chance to acquire GIL + py::gil_scoped_release lock{}; + + copy_ev.wait(); + } + + return; + } + + // Minimum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(src_offset); + py::ssize_t npy_src_max_nelem_offset(src_offset); + for (int i = 0; i < nd; ++i) { + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + if (is_contig_vector) { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data, + src_offset, dst_data, + dst_offset, depends); + + return; + } + + std::vector host_task_events; + host_task_events.reserve(1); + + // Copy shape strides into device memory + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // invoke USM deleter in smart pointer while GIL is held + shape_strides_owner.reset(nullptr); + } + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast:: + CopyAndCastFromHostContigFactory; + + DispatchTableBuilder + dtb_copy_from_numpy_contig; + + dtb_copy_from_numpy_contig.populate_dispatch_table( + copy_and_cast_from_host_contig_blocking_dispatch_table); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 000000000000..f2de95f97cca --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
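Reviewer note: the routine above picks one of three strategies. The decision logic, restated as a small self-contained Python sketch (illustrative only, not the binding itself; the full routine also re-tests the memcpy case after iteration-space simplification):

import numpy as np

def choose_copy_strategy(src: np.ndarray, dst_dtype, dst_is_c, dst_is_f) -> str:
    # mirrors the special-case checks in copy_numpy_ndarray_into_usm_ndarray
    same_type = src.dtype == np.dtype(dst_dtype)
    both_c = src.flags["C_CONTIGUOUS"] and dst_is_c
    both_f = src.flags["F_CONTIGUOUS"] and dst_is_f
    if (both_c or both_f) and same_type:
        return "queue.memcpy"        # bitwise host-to-device copy, no kernel
    if both_c or both_f:
        return "contig cast kernel"  # one linear pass with dtype conversion
    return "strided cast kernel"     # general case: shape/strides packed on device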
diff --git a/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
new file mode 100644
index 000000000000..f2de95f97cca
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void copy_numpy_ndarray_into_usm_ndarray(
+    const py::array &npy_src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends = {});
+
+extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/device_support_queries.cpp b/dpnp/tensor/libtensor/source/device_support_queries.cpp
new file mode 100644
index 000000000000..6026520f3daa
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/device_support_queries.cpp
@@ -0,0 +1,173 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <string>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+
+namespace
+{
+
+std::string _default_device_fp_type(const sycl::device &d)
+{
+    if (d.has(sycl::aspect::fp64)) {
+        return "f8";
+    }
+    else {
+        return "f4";
+    }
+}
+
+int get_numpy_major_version()
+{
+    py::module_ numpy = py::module_::import("numpy");
+    py::str version_string = numpy.attr("__version__");
+    py::module_ numpy_lib = py::module_::import("numpy.lib");
+
+    py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string);
+    int major_version = numpy_version.attr("major").cast<int>();
+
+    return major_version;
+}
+
+std::string _default_device_int_type(const sycl::device &)
+{
+    const int np_ver = get_numpy_major_version();
+
+    if (np_ver >= 2) {
+        return "i8";
+    }
+    else {
+        // code for numpy.dtype('long') to be consistent
+        // with NumPy's default integer type across
+        // platforms.
+        return "l";
+    }
+}
+
+std::string _default_device_uint_type(const sycl::device &)
+{
+    const int np_ver = get_numpy_major_version();
+
+    if (np_ver >= 2) {
+        return "u8";
+    }
+    else {
+        // code for numpy.dtype('ulong') to be consistent
+        // with NumPy's default unsigned integer type across
+        // platforms.
+        return "L";
+    }
+}
+
+std::string _default_device_complex_type(const sycl::device &d)
+{
+    if (d.has(sycl::aspect::fp64)) {
+        return "c16";
+    }
+    else {
+        return "c8";
+    }
+}
+
+std::string _default_device_bool_type(const sycl::device &) { return "b1"; }
+
+std::string _default_device_index_type(const sycl::device &) { return "i8"; }
+
+sycl::device _extract_device(const py::object &arg)
+{
+    auto const &api = dpctl::detail::dpctl_capi::get();
+
+    PyObject *source = arg.ptr();
+    if (api.PySyclQueue_Check_(source)) {
+        const sycl::queue &q = py::cast<sycl::queue &>(arg);
+        return q.get_device();
+    }
+    else if (api.PySyclDevice_Check_(source)) {
+        return py::cast<sycl::device &>(arg);
+    }
+    else {
+        throw py::type_error(
+            "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`.");
+    }
+}
+
+} // namespace
+
+std::string default_device_fp_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_fp_type(d);
+}
+
+std::string default_device_int_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_int_type(d);
+}
+
+std::string default_device_uint_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_uint_type(d);
+}
+
+std::string default_device_bool_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_bool_type(d);
+}
+
+std::string default_device_complex_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_complex_type(d);
+}
+
+std::string default_device_index_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_index_type(d);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/device_support_queries.hpp b/dpnp/tensor/libtensor/source/device_support_queries.hpp
new file mode 100644
index 000000000000..adde7aefe3dd
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/device_support_queries.hpp
@@ -0,0 +1,50 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <string>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::string default_device_fp_type(const py::object &);
+extern std::string default_device_int_type(const py::object &);
+extern std::string default_device_uint_type(const py::object &);
+extern std::string default_device_bool_type(const py::object &);
+extern std::string default_device_complex_type(const py::object &);
+extern std::string default_device_index_type(const py::object &);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/abs.cpp
new file mode 100644
index 000000000000..067a201099de
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/abs.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "abs.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/abs.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U01: ==== ABS (x)
+namespace impl
+{
+
+namespace abs_fn_ns = dpctl::tensor::kernels::abs;
+
+static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types];
+static int abs_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    abs_strided_dispatch_vector[td_ns::num_types];
+
+void populate_abs_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = abs_fn_ns;
+
+    using fn_ns::AbsContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AbsContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(abs_contig_dispatch_vector);
+
+    using fn_ns::AbsStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AbsStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(abs_strided_dispatch_vector);
+
+    using fn_ns::AbsTypeMapFactory;
+    DispatchVectorBuilder<int, AbsTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(abs_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_abs(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_abs_dispatch_vectors();
+        using impl::abs_contig_dispatch_vector;
+        using impl::abs_output_typeid_vector;
+        using impl::abs_strided_dispatch_vector;
+
+        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, abs_output_typeid_vector,
+                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
+        };
+        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
+        };
+        m.def("_abs_result_type", abs_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
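Reviewer note: every unary entry point registered here follows the same two-step protocol: query the result dtype, allocate the destination, then run the kernel. A hedged Python sketch of that driver (names `_abs`/`_abs_result_type` come from the m.def calls above; the generic wrapper and its parameters are assumptions for illustration):

# `ti_fn` / `ti_result_type_fn` are e.g. the bound _abs and _abs_result_type
import dpctl.tensor as dpt

def unary_call(ti_fn, ti_result_type_fn, x):
    res_dt = ti_result_type_fn(x.dtype)  # None when the input dtype is unsupported
    if res_dt is None:
        raise TypeError(f"unsupported input dtype {x.dtype}")
    dst = dpt.empty_like(x, dtype=res_dt)
    ht_ev, _ = ti_fn(src=x, dst=dst, sycl_queue=x.sycl_queue)
    ht_ev.wait()
    return dst

The same pattern applies verbatim to the _acos, _acosh, _asin, and other unary bindings added later in this patch.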
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/abs.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/abs.hpp
new file mode 100644
index 000000000000..b496f1e694ac
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/abs.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_abs(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/acos.cpp
new file mode 100644
index 000000000000..52d962cd828e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/acos.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "acos.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/acos.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U02: ==== ACOS (x)
+namespace impl
+{
+
+namespace acos_fn_ns = dpctl::tensor::kernels::acos;
+
+static unary_contig_impl_fn_ptr_t
+    acos_contig_dispatch_vector[td_ns::num_types];
+static int acos_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    acos_strided_dispatch_vector[td_ns::num_types];
+
+void populate_acos_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = acos_fn_ns;
+
+    using fn_ns::AcosContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcosContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(acos_contig_dispatch_vector);
+
+    using fn_ns::AcosStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcosStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(acos_strided_dispatch_vector);
+
+    using fn_ns::AcosTypeMapFactory;
+    DispatchVectorBuilder<int, AcosTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(acos_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_acos(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_acos_dispatch_vectors();
+        using impl::acos_contig_dispatch_vector;
+        using impl::acos_output_typeid_vector;
+        using impl::acos_strided_dispatch_vector;
+
+        auto acos_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, acos_output_typeid_vector,
+                acos_contig_dispatch_vector, acos_strided_dispatch_vector);
+        };
+        m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto acos_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector);
+        };
+        m.def("_acos_result_type", acos_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/acos.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/acos.hpp
new file mode 100644
index 000000000000..608b684c4e18
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/acos.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_acos(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.cpp
new file mode 100644
index 000000000000..c2334804e422
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "acosh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/acosh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U03: ==== ACOSH (x)
+namespace impl
+{
+
+namespace acosh_fn_ns = dpctl::tensor::kernels::acosh;
+
+static unary_contig_impl_fn_ptr_t
+    acosh_contig_dispatch_vector[td_ns::num_types];
+static int acosh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    acosh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_acosh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = acosh_fn_ns;
+
+    using fn_ns::AcoshContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcoshContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector);
+
+    using fn_ns::AcoshStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcoshStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector);
+
+    using fn_ns::AcoshTypeMapFactory;
+    DispatchVectorBuilder<int, AcoshTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(acosh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_acosh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_acosh_dispatch_vectors();
+        using impl::acosh_contig_dispatch_vector;
+        using impl::acosh_output_typeid_vector;
+        using impl::acosh_strided_dispatch_vector;
+
+        auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst,
                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, acosh_output_typeid_vector,
+                acosh_contig_dispatch_vector, acosh_strided_dispatch_vector);
+        };
+        m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto acosh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              acosh_output_typeid_vector);
+        };
+        m.def("_acosh_result_type", acosh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/acosh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.hpp
new file mode 100644
index 000000000000..fc74fa99874f
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_acosh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
new file mode 100644
index 000000000000..e37fad67e294
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
@@ -0,0 +1,243 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "add.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/add.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B01: ===== ADD (x1, x2)
+namespace impl
+{
+
+namespace add_fn_ns = dpctl::tensor::kernels::add;
+
+static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types]
+                                                            [td_ns::num_types];
+
+static int add_output_id_table[td_ns::num_types][td_ns::num_types];
+static int add_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    add_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// add(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+// add(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_add_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = add_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::AddTypeMapFactory;
+    DispatchTableBuilder<int, AddTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(add_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::AddStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, AddStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(add_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::AddContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, AddContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(add_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::AddContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        AddContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        add_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::AddContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        AddContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        add_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::AddInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         AddInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::AddInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         AddInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::AddInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         AddInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::AddInplaceTypeMapFactory;
+    DispatchTableBuilder<int, AddInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(add_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_add(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_add_dispatch_tables();
+        using impl::add_contig_dispatch_table;
+        using impl::add_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::add_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::add_output_id_table;
+        using impl::add_strided_dispatch_table;
+
+        auto add_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, add_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                add_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                add_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                add_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                add_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto add_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               add_output_id_table);
+        };
+        m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_add_result_type", add_result_type_pyapi, "");
+
+        using impl::add_inplace_contig_dispatch_table;
+        using impl::add_inplace_output_id_table;
+        using impl::add_inplace_row_matrix_dispatch_table;
+        using impl::add_inplace_strided_dispatch_table;
+
+        auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, add_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                add_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                add_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                add_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
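Reviewer note: the binary entry point follows the same result-type/allocate/run protocol as the unary ones, plus the matrix+row broadcast fast paths. A hedged Python sketch (the `_add`/`_add_result_type` names come from the m.def calls above; the wrapper itself is illustrative):

import numpy as np
import dpctl.tensor as dpt

def add_call(ti, x1, x2):
    # ti is the compiled extension module
    res_dt = ti._add_result_type(x1.dtype, x2.dtype)
    shape = np.broadcast_shapes(x1.shape, x2.shape)
    dst = dpt.empty(shape, dtype=res_dt, sycl_queue=x1.sycl_queue)
    # contiguous matrix+row / row+matrix inputs take the dedicated broadcast kernels
    ht_ev, _ = ti._add(src1=x1, src2=x2, dst=dst, sycl_queue=x1.sycl_queue)
    ht_ev.wait()
    return dst
    # an in-place variant, _add_inplace(lhs, rhs, sycl_queue), reuses the
    # separate inplace dispatch tables registered above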
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.hpp
new file mode 100644
index 000000000000..0797adb79ddb
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/add.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_add(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/angle.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/angle.cpp
new file mode 100644
index 000000000000..df2b97fe7644
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/angle.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "angle.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/angle.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U43: ==== ANGLE (x)
+namespace impl
+{
+
+namespace angle_fn_ns = dpctl::tensor::kernels::angle;
+
+static unary_contig_impl_fn_ptr_t
+    angle_contig_dispatch_vector[td_ns::num_types];
+static int angle_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    angle_strided_dispatch_vector[td_ns::num_types];
+
+void populate_angle_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = angle_fn_ns;
+
+    using fn_ns::AngleContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AngleContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(angle_contig_dispatch_vector);
+
+    using fn_ns::AngleStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AngleStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(angle_strided_dispatch_vector);
+
+    using fn_ns::AngleTypeMapFactory;
+    DispatchVectorBuilder<int, AngleTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(angle_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_angle(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_angle_dispatch_vectors();
+        using impl::angle_contig_dispatch_vector;
+        using impl::angle_output_typeid_vector;
+        using impl::angle_strided_dispatch_vector;
+
+        auto angle_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, angle_output_typeid_vector,
+                angle_contig_dispatch_vector, angle_strided_dispatch_vector);
+        };
+        m.def("_angle", angle_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto angle_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              angle_output_typeid_vector);
+        };
+        m.def("_angle_result_type", angle_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
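Reviewer note: unlike abs, angle maps complex inputs to real outputs, which is exactly what the AngleTypeMapFactory-built typeid vector encodes. Roughly, in Python (an assumed mapping for illustration; the authoritative table is angle_output_typeid_vector):

import numpy as np

ANGLE_RESULT = {
    np.dtype("complex64"): np.dtype("float32"),
    np.dtype("complex128"): np.dtype("float64"),
}

def angle_result_dtype(dt):
    # mirrors what _angle_result_type reports for complex inputs
    return ANGLE_RESULT.get(np.dtype(dt))  # None for unsupported dtypes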
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_angle(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/asin.cpp new file mode 100644 index 000000000000..32d71c67527e --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/asin.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "asin.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/asin.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U04: ==== ASIN (x)
+namespace impl
+{
+
+namespace asin_fn_ns = dpctl::tensor::kernels::asin;
+
+static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types];
+static int asin_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    asin_strided_dispatch_vector[td_ns::num_types];
+
+void populate_asin_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = asin_fn_ns;
+
+    using fn_ns::AsinContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(asin_contig_dispatch_vector);
+
+    using fn_ns::AsinStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(asin_strided_dispatch_vector);
+
+    using fn_ns::AsinTypeMapFactory;
+    DispatchVectorBuilder<int, AsinTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(asin_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_asin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_asin_dispatch_vectors();
+        using impl::asin_contig_dispatch_vector;
+        using impl::asin_output_typeid_vector;
+        using impl::asin_strided_dispatch_vector;
+
+        auto asin_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, asin_output_typeid_vector,
+                asin_contig_dispatch_vector, asin_strided_dispatch_vector);
+        };
+        m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto asin_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector);
+        };
+        m.def("_asin_result_type", asin_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/asin.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/asin.hpp
new file mode 100644
index 000000000000..39230000bdfc
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/asin.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_asin(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.cpp new file mode 100644 index 000000000000..47f8a7dbf190 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "asinh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/asinh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U05: ==== ASINH (x)
+namespace impl
+{
+
+namespace asinh_fn_ns = dpctl::tensor::kernels::asinh;
+
+static unary_contig_impl_fn_ptr_t
+    asinh_contig_dispatch_vector[td_ns::num_types];
+static int asinh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    asinh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_asinh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = asinh_fn_ns;
+
+    using fn_ns::AsinhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector);
+
+    using fn_ns::AsinhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector);
+
+    using fn_ns::AsinhTypeMapFactory;
+    DispatchVectorBuilder<int, AsinhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(asinh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_asinh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_asinh_dispatch_vectors();
+        using impl::asinh_contig_dispatch_vector;
+        using impl::asinh_output_typeid_vector;
+        using impl::asinh_strided_dispatch_vector;
+
+        auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, asinh_output_typeid_vector,
+                asinh_contig_dispatch_vector, asinh_strided_dispatch_vector);
+        };
+        m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto asinh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              asinh_output_typeid_vector);
+        };
+        m.def("_asinh_result_type", asinh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/asinh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.hpp
new file mode 100644
index
000000000000..0d761f082ae3 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_asinh(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan.cpp new file mode 100644 index 000000000000..74ee82edbbc9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/atan.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atan.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atan.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U06: ==== ATAN (x)
+namespace impl
+{
+
+namespace atan_fn_ns = dpctl::tensor::kernels::atan;
+
+static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types];
+static int atan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    atan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_atan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atan_fn_ns;
+
+    using fn_ns::AtanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(atan_contig_dispatch_vector);
+
+    using fn_ns::AtanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(atan_strided_dispatch_vector);
+
+    using fn_ns::AtanTypeMapFactory;
+    DispatchVectorBuilder<int, AtanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(atan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_atan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atan_dispatch_vectors();
+        using impl::atan_contig_dispatch_vector;
+        using impl::atan_output_typeid_vector;
+        using impl::atan_strided_dispatch_vector;
+
+        auto atan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, atan_output_typeid_vector,
+                atan_contig_dispatch_vector, atan_strided_dispatch_vector);
+        };
+        m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto atan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector);
+        };
+        m.def("_atan_result_type", atan_result_type_pyapi);
+    }
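[Editor's aside, not part of the patch: each `_atan`-style lambda above forwards to py_unary_ufunc, which gates the request on the type-map table and then picks either the contiguous fast path or the strided fallback. The sketch below uses invented signatures to show that selection step; it is not the real py_unary_ufunc API.]

// Hypothetical selection step behind a unary ufunc binding (assumed types).
#include <stdexcept>

using unary_fn_t = void (*)(const void *, void *, unsigned long);

unary_fn_t select_unary_impl(int src_typeid,
                             const int *output_typeid_table, // e.g. the type-map vector
                             const unary_fn_t *contig_table,
                             const unary_fn_t *strided_table,
                             bool src_and_dst_contiguous)
{
    if (output_typeid_table[src_typeid] < 0) {
        // mirrors the error raised for an unsupported input dtype
        throw std::runtime_error("unsupported input dtype");
    }
    if (src_and_dst_contiguous && contig_table[src_typeid]) {
        return contig_table[src_typeid]; // single linear-walk kernel
    }
    return strided_table[src_typeid]; // general shape/stride-aware kernel
}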
+} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan.hpp new file mode 100644 index 000000000000..c4eb3f3baf92 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/atan.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_atan(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.cpp new file mode 100644 index 000000000000..60bb2e081fef --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.cpp @@ -0,0 +1,146 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atan2.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atan2.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B02: ===== ATAN2 (x1, x2)
+namespace impl
+{
+namespace atan2_fn_ns = dpctl::tensor::kernels::atan2;
+
+static binary_contig_impl_fn_ptr_t
+    atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int atan2_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_atan2_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atan2_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::Atan2TypeMapFactory;
+    DispatchTableBuilder<int, Atan2TypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(atan2_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::Atan2StridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, Atan2StridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(atan2_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::Atan2ContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, Atan2ContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(atan2_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_atan2(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atan2_dispatch_tables();
+        using impl::atan2_contig_dispatch_table;
+        using impl::atan2_output_id_table;
+        using impl::atan2_strided_dispatch_table;
+
+        auto atan2_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, atan2_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                atan2_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                atan2_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto atan2_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               atan2_output_id_table);
+        };
+        m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_atan2_result_type", atan2_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atan2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.hpp
new file mode 100644
index 000000000000..5bdf9b74db2e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
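[Editor's aside, not part of the patch: binary operators such as `_atan2` consult a two-dimensional result-type table rather than the one-dimensional vector used by unary functions. A small illustrative sketch follows; the 2x2 table contents and type ids are invented for the example, while the real table is filled by Atan2TypeMapFactory over td_ns::num_types ids.]

// Sketch of the 2-D lookup behind _atan2_result_type (assumed contents).
constexpr int num_types = 2; // pretend type ids: 0=float, 1=double

constexpr int output_id_table[num_types][num_types] = {
    {0, 1}, // float  op float -> float,  float  op double -> double
    {1, 1}, // double op float -> double, double op double -> double
};

// A negative entry conventionally marks an unsupported dtype pair.
constexpr int result_typeid(int t1, int t2)
{
    return output_id_table[t1][t2];
}

static_assert(result_typeid(0, 1) == 1, "float op double promotes to double");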
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_atan2(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.cpp new file mode 100644 index 000000000000..2857f9ab8c10 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atanh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atanh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U07: ==== ATANH (x)
+namespace impl
+{
+
+namespace atanh_fn_ns = dpctl::tensor::kernels::atanh;
+
+static unary_contig_impl_fn_ptr_t
+    atanh_contig_dispatch_vector[td_ns::num_types];
+static int atanh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    atanh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_atanh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atanh_fn_ns;
+
+    using fn_ns::AtanhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector);
+
+    using fn_ns::AtanhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector);
+
+    using fn_ns::AtanhTypeMapFactory;
+    DispatchVectorBuilder<int, AtanhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(atanh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_atanh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atanh_dispatch_vectors();
+        using impl::atanh_contig_dispatch_vector;
+        using impl::atanh_output_typeid_vector;
+        using impl::atanh_strided_dispatch_vector;
+
+        auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, atanh_output_typeid_vector,
+                atanh_contig_dispatch_vector, atanh_strided_dispatch_vector);
+        };
+        m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto atanh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              atanh_output_typeid_vector);
+        };
+        m.def("_atanh_result_type", atanh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atanh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.hpp
new file mode 100644
index 000000000000..5604e48deef6
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_atanh(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp new file mode 100644 index 000000000000..3976f480ff6d --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp @@ -0,0 +1,206 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_and.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_and.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B03: ===== BITWISE_AND (x1, x2)
+namespace impl
+{
+namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_and_inplace_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_and_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_and_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_bitwise_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseAndTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseAndInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseAndInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(bitwise_and_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseAndInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseAndInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(bitwise_and_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseAndInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseAndInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(bitwise_and_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_and_dispatch_tables();
+        using impl::bitwise_and_contig_dispatch_table;
+        using impl::bitwise_and_output_id_table;
+        using impl::bitwise_and_strided_dispatch_table;
+
+        auto bitwise_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_and_output_id_table);
+        };
+        m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, "");
+
+        using impl::bitwise_and_inplace_contig_dispatch_table;
+        using impl::bitwise_and_inplace_output_id_table;
+        using impl::bitwise_and_inplace_strided_dispatch_table;
+
+        auto bitwise_and_inplace_pyapi = [&](const arrayT &src,
+                                             const arrayT &dst,
+                                             sycl::queue &exec_q,
+                                             const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, bitwise_and_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                bitwise_and_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                bitwise_and_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_bitwise_and_inplace", bitwise_and_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
new file mode
100644 index 000000000000..19f29ae8822e --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_and(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp new file mode 100644 index 000000000000..05e7f4eeb61b --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp @@ -0,0 +1,129 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_invert.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_invert.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U08: ===== BITWISE_INVERT (x)
+namespace impl
+{
+
+namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert;
+
+static unary_contig_impl_fn_ptr_t
+    bitwise_invert_contig_dispatch_vector[td_ns::num_types];
+static int bitwise_invert_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    bitwise_invert_strided_dispatch_vector[td_ns::num_types];
+
+void populate_bitwise_invert_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_invert_fn_ns;
+
+    using fn_ns::BitwiseInvertContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t,
+                          BitwiseInvertContigFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector);
+
+    using fn_ns::BitwiseInvertStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
+                          BitwiseInvertStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector);
+
+    using fn_ns::BitwiseInvertTypeMapFactory;
+    DispatchVectorBuilder<int, BitwiseInvertTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_bitwise_invert(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_invert_dispatch_vectors();
+        using impl::bitwise_invert_contig_dispatch_vector;
+        using impl::bitwise_invert_output_typeid_vector;
+        using impl::bitwise_invert_strided_dispatch_vector;
+
+        auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                        sycl::queue &exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q,
depends, + bitwise_invert_output_typeid_vector, + bitwise_invert_contig_dispatch_vector, + bitwise_invert_strided_dispatch_vector); + }; + m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type( + dtype, bitwise_invert_output_typeid_vector); + }; + m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp new file mode 100644 index 000000000000..e20c0df3cf11 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
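[Editor's aside, not part of the patch: the in-place variants registered above (e.g. `_bitwise_and_inplace`) carry their own type-map table because the result must land in the left-hand operand's dtype, which is overwritten. The sketch below illustrates that extra constraint with invented table contents and type ids.]

// Sketch of the in-place type gate (assumed contents, not the real tables).
constexpr int num_types = 2; // pretend type ids: 0=int32, 1=int64

constexpr int inplace_id_table[num_types][num_types] = {
    {0, -1}, // int32 &= int32 ok; int32 &= int64 rejected (would narrow)
    {1, 1},  // int64 absorbs either operand
};

constexpr bool inplace_supported(int lhs_typeid, int rhs_typeid)
{
    return inplace_id_table[lhs_typeid][rhs_typeid] == lhs_typeid;
}

static_assert(!inplace_supported(0, 1), "no implicit narrowing in-place");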
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_invert(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp new file mode 100644 index 000000000000..c26c9a42864f --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp @@ -0,0 +1,216 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_left_shift.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_left_shift.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B04: ===== BITWISE_LEFT_SHIFT (x1, x2)
+namespace impl
+{
+namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_left_shift_contig_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+static int bitwise_left_shift_output_id_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static int bitwise_left_shift_inplace_output_id_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_left_shift_strided_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_left_shift_inplace_contig_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_left_shift_inplace_strided_dispatch_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+
+void populate_bitwise_left_shift_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_left_shift_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseLeftShiftTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseLeftShiftTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        bitwise_left_shift_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        bitwise_left_shift_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseLeftShiftInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseLeftShiftInplaceTypeMapFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(bitwise_left_shift_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_left_shift(py::module_ m)
+{
+    using
+
+void init_bitwise_left_shift(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_left_shift_dispatch_tables();
+        using impl::bitwise_left_shift_contig_dispatch_table;
+        using impl::bitwise_left_shift_output_id_table;
+        using impl::bitwise_left_shift_strided_dispatch_table;
+
+        auto bitwise_left_shift_pyapi = [&](const arrayT &src1,
+                                            const arrayT &src2,
+                                            const arrayT &dst,
+                                            sycl::queue &exec_q,
+                                            const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                bitwise_left_shift_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_left_shift_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_left_shift_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_left_shift_result_type_pyapi =
+            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
+                return py_binary_ufunc_result_type(
+                    dtype1, dtype2, bitwise_left_shift_output_id_table);
+            };
+        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "",
+              py::arg("src1"), py::arg("src2"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+        m.def("_bitwise_left_shift_result_type",
+              bitwise_left_shift_result_type_pyapi, "");
+
+        using impl::bitwise_left_shift_inplace_contig_dispatch_table;
+        using impl::bitwise_left_shift_inplace_output_id_table;
+        using impl::bitwise_left_shift_inplace_strided_dispatch_table;
+
+        auto bitwise_left_shift_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends,
+                    bitwise_left_shift_inplace_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    bitwise_left_shift_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    bitwise_left_shift_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_bitwise_left_shift_inplace", bitwise_left_shift_inplace_pyapi,
+              "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
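The td_ns::NullPtrTable<...>{} arguments above deliberately supply tables whose entries are all null pointers: the shift operation registers no specialized matrix-row broadcast kernels, so py_binary_ufunc falls back to the strided implementation for those shapes. A toy equivalent of such a table, under hypothetical Demo* names rather than dpctl's definition:

#include <cstddef>

using demo_fn_ptr_t = void (*)(std::size_t);

template <typename fnT, int n = 3> struct DemoNullPtrTable
{
    fnT table[n][n] = {}; // value-initialization: every entry is nullptr
};

int main()
{
    DemoNullPtrTable<demo_fn_ptr_t> t{};
    // dispatch code checks the entry and takes the strided fallback
    // whenever it finds nullptr here
    return (t.table[1][2] == nullptr) ? 0 : 1;
}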
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
new file mode 100644
index 000000000000..49a7947d98c3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_bitwise_left_shift(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
new file mode 100644
index 000000000000..bbb138c406fb
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
@@ -0,0 +1,206 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "bitwise_or.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/bitwise_or.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B05: ===== BITWISE_OR (x1, x2) +namespace impl +{ +namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or; + +static binary_contig_impl_fn_ptr_t + bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_or_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_or_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_or_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_or_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_or_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseOrTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(bitwise_or_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseOrStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseOrContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using 
fn_ns::BitwiseOrInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(bitwise_or_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseOrInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(bitwise_or_inplace_contig_dispatch_table); + + // which types are supported by the in-place kernels + using fn_ns::BitwiseOrInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(bitwise_or_inplace_output_id_table); +}; + +} // namespace impl + +void init_bitwise_or(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_bitwise_or_dispatch_tables(); + using impl::bitwise_or_contig_dispatch_table; + using impl::bitwise_or_output_id_table; + using impl::bitwise_or_strided_dispatch_table; + + auto bitwise_or_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_or_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_or_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_or_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_or_output_id_table); + }; + m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, ""); + + using impl::bitwise_or_inplace_contig_dispatch_table; + using impl::bitwise_or_inplace_output_id_table; + using impl::bitwise_or_inplace_strided_dispatch_table; + + auto bitwise_or_inplace_pyapi = [&](const arrayT &src, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_or_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_or_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_or_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_or_inplace", bitwise_or_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp new file mode 100644 index 000000000000..1e24caa54429 --- 
/dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_or(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp new file mode 100644 index 000000000000..099dd56b4484 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp @@ -0,0 +1,217 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "bitwise_right_shift.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/bitwise_right_shift.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) +namespace impl +{ +namespace bitwise_right_shift_fn_ns = + dpctl::tensor::kernels::bitwise_right_shift; + +static binary_contig_impl_fn_ptr_t + bitwise_right_shift_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static int bitwise_right_shift_output_id_table[td_ns::num_types] + [td_ns::num_types]; +static int bitwise_right_shift_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_right_shift_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_right_shift_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_right_shift_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_right_shift_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_right_shift_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseRightShiftTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table); + + // function pointers for operation on general strided arrays + using 
fn_ns::BitwiseRightShiftStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseRightShiftContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseRightShiftInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + bitwise_right_shift_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseRightShiftInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table( + bitwise_right_shift_inplace_contig_dispatch_table); + + // which types are supported by the in-place kernels + using fn_ns::BitwiseRightShiftInplaceTypeMapFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(bitwise_right_shift_inplace_output_id_table); +}; + +} // namespace impl + +void init_bitwise_right_shift(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_bitwise_right_shift_dispatch_tables(); + using impl::bitwise_right_shift_contig_dispatch_table; + using impl::bitwise_right_shift_output_id_table; + using impl::bitwise_right_shift_strided_dispatch_table; + + auto bitwise_right_shift_pyapi = [&](const arrayT &src1, + const arrayT &src2, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, + bitwise_right_shift_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_right_shift_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_right_shift_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_right_shift_result_type_pyapi = + [&](const py::dtype &dtype1, const py::dtype &dtype2) { + return py_binary_ufunc_result_type( + dtype1, dtype2, bitwise_right_shift_output_id_table); + }; + m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "", + py::arg("src1"), py::arg("src2"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_bitwise_right_shift_result_type", + bitwise_right_shift_result_type_pyapi, ""); + + using impl::bitwise_right_shift_inplace_contig_dispatch_table; + using impl::bitwise_right_shift_inplace_output_id_table; + using impl::bitwise_right_shift_inplace_strided_dispatch_table; + + auto bitwise_right_shift_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, + bitwise_right_shift_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_right_shift_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided 
+ // arrays (most general case) + bitwise_right_shift_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_right_shift_inplace", bitwise_right_shift_inplace_pyapi, + "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp new file mode 100644 index 000000000000..aeb24d73b2fc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_right_shift(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp new file mode 100644 index 000000000000..9a23fec82e72 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp @@ -0,0 +1,206 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "bitwise_xor.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/bitwise_xor.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B07: ===== BITWISE_XOR (x1, x2) +namespace impl +{ +namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor; + +static binary_contig_impl_fn_ptr_t + bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_xor_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_xor_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_xor_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_xor_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_xor_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseXorTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(bitwise_xor_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseXorStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseXorContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseXorInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(bitwise_xor_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseXorInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(bitwise_xor_inplace_contig_dispatch_table); + + // which types are supported by the in-place kernels + using fn_ns::BitwiseXorInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(bitwise_xor_inplace_output_id_table); +}; + +} // namespace impl + +void init_bitwise_xor(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_bitwise_xor_dispatch_tables(); + using impl::bitwise_xor_contig_dispatch_table; + using 
impl::bitwise_xor_output_id_table; + using impl::bitwise_xor_strided_dispatch_table; + + auto bitwise_xor_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_xor_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_xor_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_xor_output_id_table); + }; + m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, ""); + + using impl::bitwise_xor_inplace_contig_dispatch_table; + using impl::bitwise_xor_inplace_output_id_table; + using impl::bitwise_xor_inplace_strided_dispatch_table; + + auto bitwise_xor_inplace_pyapi = [&](const arrayT &src, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_xor_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_xor_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_xor_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_xor_inplace", bitwise_xor_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp new file mode 100644 index 000000000000..4029574cdd7d --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_xor(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.cpp new file mode 100644 index 000000000000..a061235acfd7 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cbrt.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/cbrt.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U37: ==== CBRT (x)
+namespace impl
+{
+
+namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt;
+
+static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types];
+static int cbrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cbrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cbrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cbrt_fn_ns;
+
+    using fn_ns::CbrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector);
+
+    using fn_ns::CbrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector);
+
+    using fn_ns::CbrtTypeMapFactory;
+    DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cbrt_output_typeid_vector);
+};
+
+} // namespace impl
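Unary functions such as cbrt use a one-dimensional dispatch vector instead of a table: one slot per input type id, nullptr where the type is unsupported. A self-contained sketch of that pattern, assuming a made-up three-type universe and demo_* names (the real population loop is generated by DispatchVectorBuilder over the full type list):

#include <cmath>
#include <cstdio>

using unary_fn_t = double (*)(double);
constexpr int num_types_demo = 3; // pretend ids: 0=int32, 1=float, 2=double

static unary_fn_t demo_cbrt_dispatch_vector[num_types_demo];

static double cbrt_impl(double x) { return std::cbrt(x); }

static void populate_demo_dispatch_vector()
{
    demo_cbrt_dispatch_vector[0] = nullptr;    // no integer kernel in this toy
    demo_cbrt_dispatch_vector[1] = &cbrt_impl; // float input
    demo_cbrt_dispatch_vector[2] = &cbrt_impl; // double input
}

int main()
{
    populate_demo_dispatch_vector();
    int src_typeid = 2; // as would be reported for the input usm_ndarray
    unary_fn_t fn = demo_cbrt_dispatch_vector[src_typeid];
    if (fn) {
        std::printf("cbrt(27.0) = %g\n", fn(27.0)); // prints 3
    }
    return 0;
}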
+
+void init_cbrt(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cbrt_dispatch_vectors();
+        using impl::cbrt_contig_dispatch_vector;
+        using impl::cbrt_output_typeid_vector;
+        using impl::cbrt_strided_dispatch_vector;
+
+        auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cbrt_output_typeid_vector,
+                cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector);
+        };
+        m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector);
+        };
+        m.def("_cbrt_result_type", cbrt_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.hpp
new file mode 100644
index 000000000000..53757bff7134
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cbrt(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.cpp
new file mode 100644
index 000000000000..4c4604e31692
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "ceil.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/ceil.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U09: ==== CEIL (x) +namespace impl +{ + +namespace ceil_fn_ns = dpctl::tensor::kernels::ceil; + +static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types]; +static int ceil_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + ceil_strided_dispatch_vector[td_ns::num_types]; + +void populate_ceil_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = ceil_fn_ns; + + using fn_ns::CeilContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector); + + using fn_ns::CeilStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector); + + using fn_ns::CeilTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(ceil_output_typeid_vector); +}; + +} // namespace impl + +void init_ceil(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_ceil_dispatch_vectors(); + using impl::ceil_contig_dispatch_vector; + using impl::ceil_output_typeid_vector; + using impl::ceil_strided_dispatch_vector; + + auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, ceil_output_typeid_vector, + ceil_contig_dispatch_vector, ceil_strided_dispatch_vector); + }; + m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto ceil_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector); + }; + m.def("_ceil_result_type", ceil_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/ceil.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.hpp new file mode 100644 index 000000000000..436cb5f89b2b --- /dev/null +++ 
b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_ceil(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/conj.cpp new file mode 100644 index 000000000000..cee977f719f4 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/conj.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "conj.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/conj.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U10: ==== CONJ (x)
+namespace impl
+{
+
+namespace conj_fn_ns = dpctl::tensor::kernels::conj;
+
+static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types];
+static int conj_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    conj_strided_dispatch_vector[td_ns::num_types];
+
+void populate_conj_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = conj_fn_ns;
+
+    using fn_ns::ConjContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ConjContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(conj_contig_dispatch_vector);
+
+    using fn_ns::ConjStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ConjStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(conj_strided_dispatch_vector);
+
+    using fn_ns::ConjTypeMapFactory;
+    DispatchVectorBuilder<int, ConjTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(conj_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_conj(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_conj_dispatch_vectors();
+        using impl::conj_contig_dispatch_vector;
+        using impl::conj_output_typeid_vector;
+        using impl::conj_strided_dispatch_vector;
+
+        auto conj_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, conj_output_typeid_vector,
+                conj_contig_dispatch_vector, conj_strided_dispatch_vector);
+        };
+        m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto conj_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector);
+        };
+        m.def("_conj_result_type", conj_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
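The _conj_result_type binding above exposes the populated typeid vector as a dtype query; conceptually it is a single table lookup. A hedged sketch with illustrative names (the real query additionally converts between py::dtype and internal type ids):

#include <cstdio>

constexpr int num_types_demo = 3; // pretend ids: 0=int32, 1=float, 2=complex

// conj is defined for every numeric type and preserves the dtype, hence the
// identity mapping in this toy table; -1 would mean "unsupported"
static int demo_conj_output_typeid_vector[num_types_demo] = {0, 1, 2};

static int demo_result_type(int src_typeid)
{
    return demo_conj_output_typeid_vector[src_typeid];
}

int main()
{
    std::printf("result typeid for complex input: %d\n", demo_result_type(2));
    return 0;
}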
+} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/conj.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/conj.hpp new file mode 100644 index 000000000000..4c0aeb17260b --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/conj.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_conj(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.cpp new file mode 100644 index 000000000000..8dca1635459a --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.cpp @@ -0,0 +1,146 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "copysign.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/copysign.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B25: ===== COPYSIGN (x1, x2)
+namespace impl
+{
+namespace copysign_fn_ns = dpctl::tensor::kernels::copysign;
+
+static binary_contig_impl_fn_ptr_t
+    copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int copysign_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_copysign_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = copysign_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::CopysignTypeMapFactory;
+    DispatchTableBuilder<int, CopysignTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(copysign_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::CopysignStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, CopysignStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(copysign_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::CopysignContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, CopysignContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(copysign_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_copysign(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_copysign_dispatch_tables();
+        using impl::copysign_contig_dispatch_table;
using impl::copysign_output_id_table; + using impl::copysign_strided_dispatch_table; + + auto copysign_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, copysign_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + copysign_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + copysign_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto copysign_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + copysign_output_id_table); + }; + m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_copysign_result_type", copysign_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.hpp new file mode 100644 index 000000000000..875443d792c2 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
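For binary operators such as copysign the lookup is two-dimensional: the output-type table and the kernel tables are indexed by the pair of input type ids, and an empty slot (the role td_ns::NullPtrTable plays above for the unused matrix-row broadcast specializations) means "no specialized kernel here". A self-contained sketch of that idea with invented stand-in names, not the patch's actual types:

    // Sketch of a (src1 typeid, src2 typeid) indexed binary dispatch table.
    #include <cstddef>
    #include <cstdio>

    constexpr int num_types = 2; // pretend: 0 = int, 1 = float

    using binary_fn_t = void (*)(const void *, const void *, void *, std::size_t);

    template <typename T1, typename T2, typename Tout>
    void add_impl(const void *a, const void *b, void *r, std::size_t n)
    {
        auto *pa = static_cast<const T1 *>(a);
        auto *pb = static_cast<const T2 *>(b);
        auto *pr = static_cast<Tout *>(r);
        for (std::size_t i = 0; i < n; ++i)
            pr[i] = static_cast<Tout>(pa[i]) + static_cast<Tout>(pb[i]);
    }

    // A negative entry would mean the type combination is unsupported.
    static int output_id_table[num_types][num_types] = {{0, 1}, {1, 1}};
    static binary_fn_t contig_table[num_types][num_types] = {
        {add_impl<int, int, int>, add_impl<int, float, float>},
        {add_impl<float, int, float>, add_impl<float, float, float>}};

    int main()
    {
        int a[2] = {1, 2};
        float b[2] = {0.5f, 0.25f}, r[2];
        if (auto fn = contig_table[0][1]) // nullptr would mean: fall back
            fn(a, b, r, 2);
        std::printf("out typeid=%d, r={%g, %g}\n",
                    output_id_table[0][1], r[0], r[1]); // typeid=1, {1.5, 2.25}
    }

Passing an all-nullptr table for the broadcast specializations, as init_copysign does, simply steers py_binary_ufunc to the general strided path for those shapes.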
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_copysign(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cos.cpp new file mode 100644 index 000000000000..966364c8b8c0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/cos.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
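Each pyapi lambda in these files returns the pair produced by py_unary_ufunc or py_binary_ufunc: a host-task event that keeps the Python-side array owners alive, and the computational event that later submissions can depend on. A minimal SYCL sketch of that convention; launch_fill and its body are illustrative, and the plain host task only imitates what dpctl::utils::keep_args_alive does:

    // Sketch of the (host_task_event, computational_event) return convention.
    // Compiles as a translation unit with a SYCL compiler (e.g. icpx -fsycl).
    #include <sycl/sycl.hpp>

    #include <cstddef>
    #include <utility>
    #include <vector>

    std::pair<sycl::event, sycl::event>
        launch_fill(sycl::queue &q, float *data, std::size_t n, float v,
                    const std::vector<sycl::event> &depends)
    {
        // The actual computation; callers can chain on this event.
        sycl::event comp_ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(depends);
            cgh.parallel_for(sycl::range<1>{n},
                             [=](sycl::id<1> i) { data[i] = v; });
        });
        // Stand-in for keep_args_alive: a host task that completes only after
        // the computation, so reference holders are released no earlier.
        sycl::event ht_ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(comp_ev);
            cgh.host_task([]() { /* drop owner references here */ });
        });
        return std::make_pair(ht_ev, comp_ev);
    }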
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cos.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/cos.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U11: ==== COS (x)
+namespace impl
+{
+
+namespace cos_fn_ns = dpctl::tensor::kernels::cos;
+
+static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types];
+static int cos_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cos_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cos_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cos_fn_ns;
+
+    using fn_ns::CosContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CosContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cos_contig_dispatch_vector);
+
+    using fn_ns::CosStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CosStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cos_strided_dispatch_vector);
+
+    using fn_ns::CosTypeMapFactory;
+    DispatchVectorBuilder<int, CosTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cos_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_cos(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cos_dispatch_vectors();
+        using impl::cos_contig_dispatch_vector;
+        using impl::cos_output_typeid_vector;
+        using impl::cos_strided_dispatch_vector;
+
+        auto cos_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cos_output_typeid_vector,
+                cos_contig_dispatch_vector, cos_strided_dispatch_vector);
+        };
+        m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cos_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector);
+        };
+        m.def("_cos_result_type", cos_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cos.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cos.hpp
new file mode 100644
index 000000000000..4b9ab341a355
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/cos.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cos(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.cpp new file mode 100644 index 000000000000..54fc5d57e4df --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
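The *_strided_dispatch_vector entries registered in these files exist because a kernel over non-contiguous data must translate a flat iteration index into per-array offsets using shape and stride vectors, which is exactly what simplify_iteration_space prepares in the py_unary_ufunc template later in this patch. A standalone CPU sketch of that unraveling, with invented names:

    // Sketch of what a "strided" kernel does compared with a contiguous one.
    #include <cstddef>
    #include <cstdio>

    // Apply dst = -src over an nd grid described by shape[] and per-dimension
    // element strides src_strides[]/dst_strides[].
    void negate_strided(const float *src, float *dst, int nd, const long *shape,
                        const long *src_strides, const long *dst_strides,
                        std::size_t nelems)
    {
        for (std::size_t flat = 0; flat < nelems; ++flat) {
            std::size_t rem = flat;
            long src_off = 0, dst_off = 0;
            for (int d = nd - 1; d >= 0; --d) { // unravel the flat index
                long idx = static_cast<long>(rem % shape[d]);
                rem /= shape[d];
                src_off += idx * src_strides[d];
                dst_off += idx * dst_strides[d];
            }
            dst[dst_off] = -src[src_off];
        }
    }

    int main()
    {
        // 2x2 view over a 2x3 buffer: src strides {3, 1}, contiguous dst {2, 1}
        float buf[6] = {1, 2, 3, 4, 5, 6}, out[4];
        long shape[2] = {2, 2}, sstr[2] = {3, 1}, dstr[2] = {2, 1};
        negate_strided(buf, out, 2, shape, sstr, dstr, 4);
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // -1 -2 -4 -5
    }

When both arrays are contiguous this index arithmetic is pure overhead, which is why the host functions below check for the contiguous fast path first.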
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cosh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/cosh.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U12: ==== COSH (x)
+namespace impl
+{
+
+namespace cosh_fn_ns = dpctl::tensor::kernels::cosh;
+
+static unary_contig_impl_fn_ptr_t
+    cosh_contig_dispatch_vector[td_ns::num_types];
+static int cosh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cosh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cosh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cosh_fn_ns;
+
+    using fn_ns::CoshContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CoshContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector);
+
+    using fn_ns::CoshStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CoshStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector);
+
+    using fn_ns::CoshTypeMapFactory;
+    DispatchVectorBuilder<int, CoshTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cosh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_cosh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cosh_dispatch_vectors();
+        using impl::cosh_contig_dispatch_vector;
+        using impl::cosh_output_typeid_vector;
+        using impl::cosh_strided_dispatch_vector;
+
+        auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cosh_output_typeid_vector,
+                cosh_contig_dispatch_vector, cosh_strided_dispatch_vector);
+        };
+        m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cosh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector);
+        };
+        m.def("_cosh_result_type", cosh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cosh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.hpp
new file mode 100644
index 000000000000..6ddfe5643b54
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cosh(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp new file mode 100644 index 000000000000..dc09318d66ad --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp @@ -0,0 +1,191 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include "abs.hpp" +#include "acos.hpp" +#include "acosh.hpp" +#include "add.hpp" +#include "angle.hpp" +#include "asin.hpp" +#include "asinh.hpp" +#include "atan.hpp" +#include "atan2.hpp" +#include "atanh.hpp" +#include "bitwise_and.hpp" +#include "bitwise_invert.hpp" +#include "bitwise_left_shift.hpp" +#include "bitwise_or.hpp" +#include "bitwise_right_shift.hpp" +#include "bitwise_xor.hpp" +#include "cbrt.hpp" +#include "ceil.hpp" +#include "conj.hpp" +#include "copysign.hpp" +#include "cos.hpp" +#include "cosh.hpp" +#include "equal.hpp" +#include "exp.hpp" +#include "exp2.hpp" +#include "expm1.hpp" +#include "floor.hpp" +#include "floor_divide.hpp" +#include "greater.hpp" +#include "greater_equal.hpp" +#include "hypot.hpp" +#include "imag.hpp" +#include "isfinite.hpp" +#include "isinf.hpp" +#include "isnan.hpp" +#include "less.hpp" +#include "less_equal.hpp" +#include "log.hpp" +#include "log10.hpp" +#include "log1p.hpp" +#include "log2.hpp" +#include "logaddexp.hpp" +#include "logical_and.hpp" +#include "logical_not.hpp" +#include "logical_or.hpp" +#include "logical_xor.hpp" +#include "maximum.hpp" +#include "minimum.hpp" +#include "multiply.hpp" +#include "negative.hpp" +#include "nextafter.hpp" +#include "not_equal.hpp" +#include "positive.hpp" +#include "pow.hpp" +#include "proj.hpp" +#include "real.hpp" +#include "reciprocal.hpp" +#include "remainder.hpp" +#include "round.hpp" +#include "rsqrt.hpp" +#include "sign.hpp" +#include "signbit.hpp" +#include "sin.hpp" +#include "sinh.hpp" +#include "sqrt.hpp" +#include "square.hpp" +#include "subtract.hpp" +#include "tan.hpp" +#include "tanh.hpp" +#include "true_divide.hpp" +#include "trunc.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; + +/*! 
@brief Add elementwise functions to Python module */ +void init_elementwise_functions(py::module_ m) +{ + init_abs(m); + init_acos(m); + init_acosh(m); + init_add(m); + init_angle(m); + init_asin(m); + init_asinh(m); + init_atan(m); + init_atan2(m); + init_atanh(m); + init_bitwise_and(m); + init_bitwise_invert(m); + init_bitwise_left_shift(m); + init_bitwise_or(m); + init_bitwise_right_shift(m); + init_bitwise_xor(m); + init_cbrt(m); + init_ceil(m); + init_conj(m); + init_copysign(m); + init_cos(m); + init_cosh(m); + init_divide(m); + init_equal(m); + init_exp(m); + init_exp2(m); + init_expm1(m); + init_floor(m); + init_floor_divide(m); + init_greater(m); + init_greater_equal(m); + init_hypot(m); + init_imag(m); + init_isfinite(m); + init_isinf(m); + init_isnan(m); + init_less(m); + init_less_equal(m); + init_log(m); + init_log10(m); + init_log1p(m); + init_log2(m); + init_logaddexp(m); + init_logical_and(m); + init_logical_not(m); + init_logical_or(m); + init_logical_xor(m); + init_maximum(m); + init_minimum(m); + init_multiply(m); + init_nextafter(m); + init_negative(m); + init_not_equal(m); + init_positive(m); + init_pow(m); + init_proj(m); + init_real(m); + init_reciprocal(m); + init_remainder(m); + init_round(m); + init_rsqrt(m); + init_sign(m); + init_signbit(m); + init_sin(m); + init_sinh(m); + init_sqrt(m); + init_square(m); + init_subtract(m); + init_tan(m); + init_tanh(m); + init_trunc(m); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp new file mode 100644 index 000000000000..0c385f2d15a5 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
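init_elementwise_functions above aggregates one init_* call per operator into a single entry point. The module definition that would invoke it is not part of this excerpt, so the module name in the following sketch is an assumption made only for illustration:

    // Hypothetical wiring of the aggregate initializer into a pybind11
    // extension module; "_tensor_impl" is an assumed, illustrative name.
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    namespace dpctl::tensor::py_internal
    {
    extern void init_elementwise_functions(py::module_);
    }

    PYBIND11_MODULE(_tensor_impl, m)
    {
        // Registers _abs, _add, ..., _trunc and their *_result_type queries.
        dpctl::tensor::py_internal::init_elementwise_functions(m);
        // ... other initializers would follow here ...
    }

Splitting each operator into its own translation unit and meeting only at this aggregation point keeps rebuilds incremental and lets the compiler instantiate the heavy kernel templates in parallel.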
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_elementwise_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp new file mode 100644 index 000000000000..3a8dc6bfb56f --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -0,0 +1,807 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "elementwise_functions_type_utils.hpp" +#include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +static_assert(std::is_same_v); + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +/*! @brief Template implementing Python API for unary elementwise functions */ +template +std::pair + py_unary_ufunc(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &q, + const std::vector &depends, + // + const output_typesT &output_type_vec, + const contig_dispatchT &contig_dispatch_vector, + const strided_dispatchT &strided_dispatch_vector) +{ + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int func_output_typeid = output_type_vec[src_typeid]; + + // check that types are supported + if (dst_typeid != func_output_typeid) { + throw py::value_error( + "Destination array has unexpected elemental data type."); + } + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check that dimensions are the same + int src_nd = src.get_ndim(); + if (src_nd != dst.get_ndim()) { + throw py::value_error("Array dimensions are not the same."); + } + + // check that shapes are the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; i < src_nd; ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + // if nelems is zero, return + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check memory overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if (overlap(src, dst) && !same_logical_tensors(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // handle contiguous inputs + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = 
(is_src_f_contig && is_dst_f_contig); + + if (both_c_contig || both_f_contig) { + auto contig_fn = contig_dispatch_vector[src_typeid]; + + if (contig_fn == nullptr) { + throw std::runtime_error( + "Contiguous implementation is missing for src_typeid=" + + std::to_string(src_typeid)); + } + + auto comp_ev = contig_fn(q, src_nelems, src_data, dst_data, depends); + sycl::event ht_ev = + dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + + // simplify iteration space + // if 1d with strides 1 - input is contig + // dispatch to strided + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + simplify_iteration_space(nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + // Special case of contiguous data + auto contig_fn = contig_dispatch_vector[src_typeid]; + + if (contig_fn == nullptr) { + throw std::runtime_error( + "Contiguous implementation is missing for src_typeid=" + + std::to_string(src_typeid)); + } + + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + auto comp_ev = + contig_fn(q, src_nelems, src_data + src_elem_size * src_offset, + dst_data + dst_elem_size * dst_offset, depends); + + sycl::event ht_ev = + dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + + // Strided implementation + auto strided_fn = strided_dispatch_vector[src_typeid]; + + if (strided_fn == nullptr) { + throw std::runtime_error( + "Strided implementation is missing for src_typeid=" + + std::to_string(src_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + std::vector host_tasks{}; + host_tasks.reserve(2); + + auto ptr_size_event_triple_ = device_allocate_and_pack( + q, host_tasks, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_)); + const auto ©_shape_ev = std::get<2>(ptr_size_event_triple_); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + sycl::event strided_fn_ev = + strided_fn(q, src_nelems, nd, shape_strides, src_data, src_offset, + dst_data, dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + q, {strided_fn_ev}, shape_strides_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(q, {src, dst}, host_tasks), + strided_fn_ev); +} + +/*! 
@brief Template implementing Python API for querying of type support by + * unary elementwise functions */ +template +py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, + const output_typesT &output_types) +{ + int tn = input_dtype.num(); // NumPy type numbers are the same as in dpctl + int src_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + src_typeid = array_types.typenum_to_lookup_id(tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + using type_utils::_result_typeid; + int dst_typeid = _result_typeid(src_typeid, output_types); + + if (dst_typeid < 0) { + auto res = py::none(); + return py::cast(res); + } + else { + using type_utils::_dtype_from_typenum; + + auto dst_typenum_t = static_cast(dst_typeid); + auto dt = _dtype_from_typenum(dst_typenum_t); + + return py::cast(dt); + } +} + +// ======================== Binary functions =========================== + +namespace +{ +template +bool isEqual(Container const &c, std::initializer_list const &l) +{ + return std::equal(std::begin(c), std::end(c), std::begin(l), std::end(l)); +} +} // namespace + +/*! @brief Template implementing Python API for binary elementwise + * functions */ +template +std::pair py_binary_ufunc( + const dpctl::tensor::usm_ndarray &src1, + const dpctl::tensor::usm_ndarray &src2, + const dpctl::tensor::usm_ndarray &dst, // dst = op(src1, src2), elementwise + sycl::queue &exec_q, + const std::vector depends, + // + const output_typesT &output_type_table, + const contig_dispatchT &contig_dispatch_table, + const strided_dispatchT &strided_dispatch_table, + const contig_matrix_row_dispatchT + &contig_matrix_row_broadcast_dispatch_table, + const contig_row_matrix_dispatchT + &contig_row_matrix_broadcast_dispatch_table) +{ + // check type_nums + int src1_typenum = src1.get_typenum(); + int src2_typenum = src2.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum); + int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int output_typeid = output_type_table[src1_typeid][src2_typeid]; + + if (output_typeid != dst_typeid) { + throw py::value_error( + "Destination array has unexpected elemental data type."); + } + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check shapes, broadcasting is assumed done by caller + // check that dimensions are the same + int dst_nd = dst.get_ndim(); + if (dst_nd != src1.get_ndim() || dst_nd != src2.get_ndim()) { + throw py::value_error("Array dimensions are not the same."); + } + + // check that shapes are the same + const py::ssize_t *src1_shape = src1.get_shape_raw(); + const py::ssize_t *src2_shape = src2.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; i < dst_nd; ++i) { + src_nelems *= static_cast(src1_shape[i]); + shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] && + src2_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + // if nelems is zero, return + if (src_nelems == 0) { + return 
std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) || + (overlap(src2, dst) && !same_logical_tensors(src2, dst))) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + // check memory overlap + const char *src1_data = src1.get_data(); + const char *src2_data = src2.get_data(); + char *dst_data = dst.get_data(); + + // handle contiguous inputs + bool is_src1_c_contig = src1.is_c_contiguous(); + bool is_src1_f_contig = src1.is_f_contiguous(); + + bool is_src2_c_contig = src2.is_c_contiguous(); + bool is_src2_f_contig = src2.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = + (is_src1_c_contig && is_src2_c_contig && is_dst_c_contig); + bool all_f_contig = + (is_src1_f_contig && is_src2_f_contig && is_dst_f_contig); + + // dispatch for contiguous inputs + if (all_c_contig || all_f_contig) { + auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, 0, + src2_data, 0, dst_data, 0, depends); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + + // simplify strides + auto const &src1_strides = src1.get_strides_vector(); + auto const &src2_strides = src2.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src1_strides; + shT simplified_src2_strides; + shT simplified_dst_strides; + py::ssize_t src1_offset(0); + py::ssize_t src2_offset(0); + py::ssize_t dst_offset(0); + + int nd = dst_nd; + const py::ssize_t *shape = src1_shape; + + simplify_iteration_space_3( + nd, shape, src1_strides, src2_strides, dst_strides, + // outputs + simplified_shape, simplified_src1_strides, simplified_src2_strides, + simplified_dst_strides, src1_offset, src2_offset, dst_offset); + + std::vector host_tasks{}; + if (nd < 3) { + static constexpr auto unit_stride = + std::initializer_list{1}; + + if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) && + isEqual(simplified_src2_strides, unit_stride) && + isEqual(simplified_dst_strides, unit_stride)) { + auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, + src1_offset, src2_data, src2_offset, + dst_data, dst_offset, depends); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + if (nd == 2) { + static constexpr auto zero_one_strides = + std::initializer_list{0, 1}; + static constexpr auto one_zero_strides = + std::initializer_list{1, 0}; + static constexpr py::ssize_t one{1}; + // special case of C-contiguous matrix and a row + if (isEqual(simplified_src2_strides, zero_one_strides) && + isEqual(simplified_src1_strides, {simplified_shape[1], one}) && + isEqual(simplified_dst_strides, {simplified_shape[1], one})) { + auto matrix_row_broadcast_fn = + contig_matrix_row_broadcast_dispatch_table[src1_typeid] + [src2_typeid]; + if (matrix_row_broadcast_fn != nullptr) { + int src1_itemsize = 
src1.get_elemsize(); + int src2_itemsize = src2.get_elemsize(); + int dst_itemsize = dst.get_elemsize(); + + if (is_aligned( + src1_data + src1_offset * src1_itemsize) && + is_aligned( + src2_data + src2_offset * src2_itemsize) && + is_aligned( + dst_data + dst_offset * dst_itemsize)) { + std::size_t n0 = simplified_shape[0]; + std::size_t n1 = simplified_shape[1]; + sycl::event comp_ev = matrix_row_broadcast_fn( + exec_q, host_tasks, n0, n1, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, + depends); + + return std::make_pair( + dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, host_tasks), + comp_ev); + } + } + } + if (isEqual(simplified_src1_strides, one_zero_strides) && + isEqual(simplified_src2_strides, {one, simplified_shape[0]}) && + isEqual(simplified_dst_strides, {one, simplified_shape[0]})) { + auto row_matrix_broadcast_fn = + contig_row_matrix_broadcast_dispatch_table[src1_typeid] + [src2_typeid]; + if (row_matrix_broadcast_fn != nullptr) { + + int src1_itemsize = src1.get_elemsize(); + int src2_itemsize = src2.get_elemsize(); + int dst_itemsize = dst.get_elemsize(); + + if (is_aligned( + src1_data + src1_offset * src1_itemsize) && + is_aligned( + src2_data + src2_offset * src2_itemsize) && + is_aligned( + dst_data + dst_offset * dst_itemsize)) { + std::size_t n0 = simplified_shape[1]; + std::size_t n1 = simplified_shape[0]; + sycl::event comp_ev = row_matrix_broadcast_fn( + exec_q, host_tasks, n0, n1, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, + depends); + + return std::make_pair( + dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, host_tasks), + comp_ev); + } + } + } + } + } + + // dispatch to strided code + auto strided_fn = strided_dispatch_table[src1_typeid][src2_typeid]; + + if (strided_fn == nullptr) { + throw std::runtime_error( + "Strided implementation is missing for src1_typeid=" + + std::to_string(src1_typeid) + + " and src2_typeid=" + std::to_string(src2_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_sz_event_triple_ = device_allocate_and_pack( + exec_q, host_tasks, simplified_shape, simplified_src1_strides, + simplified_src2_strides, simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto ©_shape_ev = std::get<2>(ptr_sz_event_triple_); + + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + sycl::event strided_fn_ev = strided_fn( + exec_q, src_nelems, nd, shape_strides, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {strided_fn_ev}, shape_strides_owner); + host_tasks.push_back(tmp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, host_tasks), + strided_fn_ev); +} + +/*! 
@brief Type querying for binary elementwise functions */ +template +py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, + const py::dtype &input2_dtype, + const output_typesT &output_types_table) +{ + int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl + int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl + int src1_typeid = -1; + int src2_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + src1_typeid = array_types.typenum_to_lookup_id(tn1); + src2_typeid = array_types.typenum_to_lookup_id(tn2); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 || + src2_typeid >= td_ns::num_types) { + throw std::runtime_error("binary output type lookup failed"); + } + int dst_typeid = output_types_table[src1_typeid][src2_typeid]; + + if (dst_typeid < 0) { + auto res = py::none(); + return py::cast(res); + } + else { + using type_utils::_dtype_from_typenum; + + auto dst_typenum_t = static_cast(dst_typeid); + auto dt = _dtype_from_typenum(dst_typenum_t); + + return py::cast(dt); + } +} + +// ==================== Inplace binary functions ======================= + +template +std::pair + py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, + const dpctl::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector depends, + // + const output_typesT &output_type_table, + const contig_dispatchT &contig_dispatch_table, + const strided_dispatchT &strided_dispatch_table, + const contig_row_matrix_dispatchT + &contig_row_matrix_broadcast_dispatch_table) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(lhs); + + // check type_nums + int rhs_typenum = rhs.get_typenum(); + int lhs_typenum = lhs.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum); + int lhs_typeid = array_types.typenum_to_lookup_id(lhs_typenum); + + int output_typeid = output_type_table[rhs_typeid][lhs_typeid]; + + if (output_typeid != lhs_typeid) { + throw py::value_error( + "Left-hand side array has unexpected elemental data type."); + } + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {rhs, lhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check shapes, broadcasting is assumed done by caller + // check that dimensions are the same + int lhs_nd = lhs.get_ndim(); + if (lhs_nd != rhs.get_ndim()) { + throw py::value_error("Array dimensions are not the same."); + } + + // check that shapes are the same + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + const py::ssize_t *lhs_shape = lhs.get_shape_raw(); + bool shapes_equal(true); + std::size_t rhs_nelems(1); + + for (int i = 0; i < lhs_nd; ++i) { + rhs_nelems *= static_cast(rhs_shape[i]); + shapes_equal = shapes_equal && (rhs_shape[i] == lhs_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + // if nelems is zero, return + if (rhs_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(lhs, rhs_nelems); + + // check memory overlap + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(rhs, lhs) && !same_logical_tensors(rhs, lhs)) { + throw 
py::value_error("Arrays index overlapping segments of memory"); + } + // check memory overlap + const char *rhs_data = rhs.get_data(); + char *lhs_data = lhs.get_data(); + + // handle contiguous inputs + bool is_rhs_c_contig = rhs.is_c_contiguous(); + bool is_rhs_f_contig = rhs.is_f_contiguous(); + + bool is_lhs_c_contig = lhs.is_c_contiguous(); + bool is_lhs_f_contig = lhs.is_f_contiguous(); + + bool both_c_contig = (is_rhs_c_contig && is_lhs_c_contig); + bool both_f_contig = (is_rhs_f_contig && is_lhs_f_contig); + + // dispatch for contiguous inputs + if (both_c_contig || both_f_contig) { + auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = contig_fn(exec_q, rhs_nelems, rhs_data, 0, lhs_data, + 0, depends); + sycl::event ht_ev = + dpctl::utils::keep_args_alive(exec_q, {rhs, lhs}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + + // simplify strides + auto const &rhs_strides = rhs.get_strides_vector(); + auto const &lhs_strides = lhs.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_rhs_strides; + shT simplified_lhs_strides; + py::ssize_t rhs_offset(0); + py::ssize_t lhs_offset(0); + + int nd = lhs_nd; + const py::ssize_t *shape = rhs_shape; + + simplify_iteration_space(nd, shape, rhs_strides, lhs_strides, + // outputs + simplified_shape, simplified_rhs_strides, + simplified_lhs_strides, rhs_offset, lhs_offset); + + std::vector host_tasks{}; + if (nd < 3) { + static constexpr auto unit_stride = + std::initializer_list{1}; + + if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) && + isEqual(simplified_lhs_strides, unit_stride)) { + auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = + contig_fn(exec_q, rhs_nelems, rhs_data, rhs_offset, + lhs_data, lhs_offset, depends); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {rhs, lhs}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + if (nd == 2) { + static constexpr auto one_zero_strides = + std::initializer_list{1, 0}; + static constexpr py::ssize_t one{1}; + // special case of C-contiguous matrix and a row + if (isEqual(simplified_rhs_strides, one_zero_strides) && + isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) { + auto row_matrix_broadcast_fn = + contig_row_matrix_broadcast_dispatch_table[rhs_typeid] + [lhs_typeid]; + if (row_matrix_broadcast_fn != nullptr) { + std::size_t n0 = simplified_shape[1]; + std::size_t n1 = simplified_shape[0]; + sycl::event comp_ev = row_matrix_broadcast_fn( + exec_q, host_tasks, n0, n1, rhs_data, rhs_offset, + lhs_data, lhs_offset, depends); + + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {lhs, rhs}, host_tasks), + comp_ev); + } + } + } + } + + // dispatch to strided code + auto strided_fn = strided_dispatch_table[rhs_typeid][lhs_typeid]; + + if (strided_fn == nullptr) { + throw std::runtime_error( + "Strided implementation is missing for rhs_typeid=" + + std::to_string(rhs_typeid) + + " and lhs_typeid=" + std::to_string(lhs_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_sz_event_triple_ = device_allocate_and_pack( + exec_q, host_tasks, simplified_shape, simplified_rhs_strides, + simplified_lhs_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto copy_shape_ev = std::get<2>(ptr_sz_event_triple_); + + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + sycl::event 
+ sycl::event strided_fn_ev =
+ strided_fn(exec_q, rhs_nelems, nd, shape_strides, rhs_data, rhs_offset,
+ lhs_data, lhs_offset, depends, {copy_shape_ev});
+
+ // async free of shape_strides temporary
+ sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+ exec_q, {strided_fn_ev}, shape_strides_owner);
+
+ host_tasks.push_back(tmp_cleanup_ev);
+
+ return std::make_pair(
+ dpctl::utils::keep_args_alive(exec_q, {rhs, lhs}, host_tasks),
+ strided_fn_ev);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
new file mode 100644
index 000000000000..7d327ada7349
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
@@ -0,0 +1,96 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions for looking up supported types in elementwise
+/// functions.
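+///
+/// Illustrative composition of these helpers (a sketch, not part of the
+/// build): a result typeid is looked up in a per-function mapping table and
+/// converted to a NumPy dtype only when the lookup succeeds:
+///
+///   int out_id = _result_typeid(src_typeid, fn_output_id);
+///   py::object res = (out_id < 0)
+///       ? py::object(py::none())
+///       : py::cast(_dtype_from_typenum(
+///             static_cast<td_ns::typenum_t>(out_id)));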
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "elementwise_functions_type_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal::type_utils
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t)
+{
+ switch (dst_typenum_t) {
+ case td_ns::typenum_t::BOOL:
+ return py::dtype("?");
+ case td_ns::typenum_t::INT8:
+ return py::dtype("i1");
+ case td_ns::typenum_t::UINT8:
+ return py::dtype("u1");
+ case td_ns::typenum_t::INT16:
+ return py::dtype("i2");
+ case td_ns::typenum_t::UINT16:
+ return py::dtype("u2");
+ case td_ns::typenum_t::INT32:
+ return py::dtype("i4");
+ case td_ns::typenum_t::UINT32:
+ return py::dtype("u4");
+ case td_ns::typenum_t::INT64:
+ return py::dtype("i8");
+ case td_ns::typenum_t::UINT64:
+ return py::dtype("u8");
+ case td_ns::typenum_t::HALF:
+ return py::dtype("f2");
+ case td_ns::typenum_t::FLOAT:
+ return py::dtype("f4");
+ case td_ns::typenum_t::DOUBLE:
+ return py::dtype("f8");
+ case td_ns::typenum_t::CFLOAT:
+ return py::dtype("c8");
+ case td_ns::typenum_t::CDOUBLE:
+ return py::dtype("c16");
+ default:
+ throw py::value_error("Unrecognized dst_typeid");
+ }
+}
+
+int _result_typeid(int arg_typeid, const int *fn_output_id)
+{
+ if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) {
+ throw py::value_error("Input typeid " + std::to_string(arg_typeid) +
+ " is outside of expected bounds.");
+ }
+
+ return fn_output_id[arg_typeid];
+}
+
+} // namespace dpctl::tensor::py_internal::type_utils
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
new file mode 100644
index 000000000000..d3324feb3470
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
@@ -0,0 +1,56 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions for looking up supported types in elementwise
+/// functions.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal::type_utils
+{
+
+/*! @brief Produce dtype from a type number */
+extern py::dtype _dtype_from_typenum(td_ns::typenum_t);
+
+/*! @brief Lookup typeid of the result from typeid of
+ * argument and the mapping table */
+extern int _result_typeid(int, const int *);
+
+} // namespace dpctl::tensor::py_internal::type_utils
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/equal.cpp
new file mode 100644
index 000000000000..863501bea367
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/equal.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B09: ===== EQUAL (x1, x2)
+namespace impl
+{
+namespace equal_fn_ns = dpctl::tensor::kernels::equal;
+
+static binary_contig_impl_fn_ptr_t
+ equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+ equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_equal_dispatch_tables(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = equal_fn_ns;
+
+ // which input types are supported, and what is the type of the result
+ using fn_ns::EqualTypeMapFactory;
+ DispatchTableBuilder<int, EqualTypeMapFactory, num_types> dtb1;
+ dtb1.populate_dispatch_table(equal_output_id_table);
+
+ // function pointers for operation on general strided arrays
+ using fn_ns::EqualStridedFactory;
+ DispatchTableBuilder<binary_strided_impl_fn_ptr_t, EqualStridedFactory,
+ num_types>
+ dtb2;
+ dtb2.populate_dispatch_table(equal_strided_dispatch_table);
+
+ // function pointers for operation on contiguous inputs and output
+ using fn_ns::EqualContigFactory;
+ DispatchTableBuilder<binary_contig_impl_fn_ptr_t, EqualContigFactory,
+ num_types>
+ dtb3;
+ dtb3.populate_dispatch_table(equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_equal(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_equal_dispatch_tables();
+ using impl::equal_contig_dispatch_table;
+ using impl::equal_output_id_table;
+ using impl::equal_strided_dispatch_table;
+
+ auto equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+ const arrayT &dst, sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_binary_ufunc(
+ src1, src2, dst, exec_q, depends, equal_output_id_table,
+ // function pointers to handle operation on contiguous arrays
+ // (pointers may be nullptr)
+ equal_contig_dispatch_table,
+ // function pointers to handle operation on strided arrays (most
+ // general case)
+ equal_strided_dispatch_table,
+ // function pointers to handle operation of c-contig matrix and
+ // c-contig row with broadcasting (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+ // function pointers to handle operation of c-contig row and
+ // c-contig matrix with broadcasting (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+ };
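+ // equal_pyapi returns a (host-task event, computational event) pair:
+ // the host-task event keeps the arrays alive until the kernel completes,
+ // while the computational event can serve as a dependency for subsequent
+ // kernels (see py_binary_inplace_ufunc in elementwise_functions.hpp for
+ // the same convention).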
+ auto equal_result_type_pyapi = [&](const py::dtype &dtype1,
+ const py::dtype &dtype2) {
+ return py_binary_ufunc_result_type(dtype1, dtype2,
+ equal_output_id_table);
+ };
+ m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"),
+ py::arg("dst"), py::arg("sycl_queue"),
+ py::arg("depends") = py::list());
+ m.def("_equal_result_type", equal_result_type_pyapi, "");
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/equal.hpp
new file mode 100644
index 000000000000..23f370111458
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp.cpp
new file mode 100644
index 000000000000..cd3cd65107f7
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/exp.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
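+///
+/// Each operation is exposed to Python as a pair of entry points; for exp
+/// these are `_exp(src, dst, sycl_queue, depends)` and
+/// `_exp_result_type(dtype)`. An illustrative (hypothetical) Python-side
+/// use, with `tei` standing for the built extension module:
+///
+///   dt = tei._exp_result_type(x.dtype)    # None if dtype is unsupported
+///   ht_ev, ev = tei._exp(x, out, q, [])   # returns a pair of sycl events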
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "exp.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/exp.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U13: ==== EXP (x)
+namespace impl
+{
+
+namespace exp_fn_ns = dpctl::tensor::kernels::exp;
+
+static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types];
+static int exp_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+ exp_strided_dispatch_vector[td_ns::num_types];
+
+void populate_exp_dispatch_vectors(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = exp_fn_ns;
+
+ using fn_ns::ExpContigFactory;
+ DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ExpContigFactory,
+ num_types>
+ dvb1;
+ dvb1.populate_dispatch_vector(exp_contig_dispatch_vector);
+
+ using fn_ns::ExpStridedFactory;
+ DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ExpStridedFactory,
+ num_types>
+ dvb2;
+ dvb2.populate_dispatch_vector(exp_strided_dispatch_vector);
+
+ using fn_ns::ExpTypeMapFactory;
+ DispatchVectorBuilder<int, ExpTypeMapFactory, num_types> dvb3;
+ dvb3.populate_dispatch_vector(exp_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_exp(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_exp_dispatch_vectors();
+ using impl::exp_contig_dispatch_vector;
+ using impl::exp_output_typeid_vector;
+ using impl::exp_strided_dispatch_vector;
+
+ auto exp_pyapi = [&](const arrayT &src, const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_unary_ufunc(
+ src, dst, exec_q, depends, exp_output_typeid_vector,
+ exp_contig_dispatch_vector, exp_strided_dispatch_vector);
+ };
+ m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"),
+ py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+ auto exp_result_type_pyapi = [&](const py::dtype &dtype) {
+ return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector);
+ };
+ m.def("_exp_result_type", exp_result_type_pyapi);
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/exp.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp.hpp
new file mode 100644
index 000000000000..14b757a18e92
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/exp.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_exp(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.cpp
new file mode 100644
index 000000000000..fc40a8e0aab9
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "exp2.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/exp2.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U38: ==== EXP2 (x)
+namespace impl
+{
+
+namespace exp2_fn_ns = dpctl::tensor::kernels::exp2;
+
+static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types];
+static int exp2_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+ exp2_strided_dispatch_vector[td_ns::num_types];
+
+void populate_exp2_dispatch_vectors(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = exp2_fn_ns;
+
+ using fn_ns::Exp2ContigFactory;
+ DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Exp2ContigFactory,
+ num_types>
+ dvb1;
+ dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector);
+
+ using fn_ns::Exp2StridedFactory;
+ DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Exp2StridedFactory,
+ num_types>
+ dvb2;
+ dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector);
+
+ using fn_ns::Exp2TypeMapFactory;
+ DispatchVectorBuilder<int, Exp2TypeMapFactory, num_types> dvb3;
+ dvb3.populate_dispatch_vector(exp2_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_exp2(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_exp2_dispatch_vectors();
+ using impl::exp2_contig_dispatch_vector;
+ using impl::exp2_output_typeid_vector;
+ using impl::exp2_strided_dispatch_vector;
+
+ auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_unary_ufunc(
+ src, dst, exec_q, depends, exp2_output_typeid_vector,
+ exp2_contig_dispatch_vector, exp2_strided_dispatch_vector);
+ };
+ m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"),
+ py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+ auto exp2_result_type_pyapi = [&](const py::dtype &dtype) {
+ return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector);
+ };
+ m.def("_exp2_result_type", exp2_result_type_pyapi);
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/exp2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.hpp
new file mode 100644
index 000000000000..f9f315d14383
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_exp2(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.cpp
new file mode 100644
index 000000000000..b4770b7b819c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "expm1.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/expm1.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U14: ==== EXPM1 (x)
+namespace impl
+{
+
+namespace expm1_fn_ns = dpctl::tensor::kernels::expm1;
+
+static unary_contig_impl_fn_ptr_t
+ expm1_contig_dispatch_vector[td_ns::num_types];
+static int expm1_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+ expm1_strided_dispatch_vector[td_ns::num_types];
+
+void populate_expm1_dispatch_vectors(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = expm1_fn_ns;
+
+ using fn_ns::Expm1ContigFactory;
+ DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Expm1ContigFactory,
+ num_types>
+ dvb1;
+ dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector);
+
+ using fn_ns::Expm1StridedFactory;
+ DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Expm1StridedFactory,
+ num_types>
+ dvb2;
+ dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector);
+
+ using fn_ns::Expm1TypeMapFactory;
+ DispatchVectorBuilder<int, Expm1TypeMapFactory, num_types> dvb3;
+ dvb3.populate_dispatch_vector(expm1_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_expm1(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_expm1_dispatch_vectors();
+ using impl::expm1_contig_dispatch_vector;
+ using impl::expm1_output_typeid_vector;
+ using impl::expm1_strided_dispatch_vector;
+
+ auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_unary_ufunc(
+ src, dst, exec_q, depends, expm1_output_typeid_vector,
+ expm1_contig_dispatch_vector, expm1_strided_dispatch_vector);
+ };
+ m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"),
+ py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+ auto expm1_result_type_pyapi = [&](const py::dtype &dtype) {
+ return py_unary_ufunc_result_type(dtype,
+ expm1_output_typeid_vector);
+ };
+ m.def("_expm1_result_type", expm1_result_type_pyapi);
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.hpp
new file mode 100644
index
000000000000..4f373fe67dff
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_expm1(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor.cpp
new file mode 100644
index 000000000000..2a81ce6552a9
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/floor.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "floor.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/floor.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U15: ==== FLOOR (x)
+namespace impl
+{
+
+namespace floor_fn_ns = dpctl::tensor::kernels::floor;
+
+static unary_contig_impl_fn_ptr_t
+ floor_contig_dispatch_vector[td_ns::num_types];
+static int floor_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+ floor_strided_dispatch_vector[td_ns::num_types];
+
+void populate_floor_dispatch_vectors(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = floor_fn_ns;
+
+ using fn_ns::FloorContigFactory;
+ DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
+ num_types>
+ dvb1;
+ dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
+
+ using fn_ns::FloorStridedFactory;
+ DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
+ num_types>
+ dvb2;
+ dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
+
+ using fn_ns::FloorTypeMapFactory;
+ DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
+ dvb3.populate_dispatch_vector(floor_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_floor(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_floor_dispatch_vectors();
+ using impl::floor_contig_dispatch_vector;
+ using impl::floor_output_typeid_vector;
+ using impl::floor_strided_dispatch_vector;
+
+ auto floor_pyapi = [&](const arrayT &src, const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_unary_ufunc(
+ src, dst, exec_q, depends, floor_output_typeid_vector,
+ floor_contig_dispatch_vector, floor_strided_dispatch_vector);
+ };
+ m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"),
+ py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+ auto floor_result_type_pyapi = [&](const py::dtype &dtype) {
+ return py_unary_ufunc_result_type(dtype,
+ floor_output_typeid_vector);
+ };
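+ // The query below mirrors the kernel dispatch: it reports the dtype the
+ // _floor kernel would produce for a given input dtype, or None when no
+ // kernel is registered for that dtype.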
m.def("_floor_result_type", floor_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/floor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor.hpp new file mode 100644 index 000000000000..5e5fe41ce313 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/floor.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_floor(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.cpp new file mode 100644 index 000000000000..af4635a0f500 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.cpp @@ -0,0 +1,205 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "floor_divide.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/floor_divide.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B10: ===== FLOOR_DIVIDE (x1, x2)
+namespace impl
+{
+namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
+
+static binary_contig_impl_fn_ptr_t
+ floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+static int floor_divide_inplace_output_id_table[td_ns::num_types]
+ [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+ floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+ floor_divide_inplace_contig_dispatch_table[td_ns::num_types]
+ [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+ floor_divide_inplace_strided_dispatch_table[td_ns::num_types]
+ [td_ns::num_types];
+
+void populate_floor_divide_dispatch_tables(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = floor_divide_fn_ns;
+
+ // which input types are supported, and what is the type of the result
+ using fn_ns::FloorDivideTypeMapFactory;
+ DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
+ dtb1.populate_dispatch_table(floor_divide_output_id_table);
+
+ // function pointers for operation on general strided arrays
+ using fn_ns::FloorDivideStridedFactory;
+ DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+ FloorDivideStridedFactory, num_types>
+ dtb2;
+ dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
+
+ // function pointers for operation on contiguous inputs and output
+ using fn_ns::FloorDivideContigFactory;
+ DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
+ num_types>
+ dtb3;
+ dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
+
+ // function pointers for inplace operation on general strided arrays
+ using fn_ns::FloorDivideInplaceStridedFactory;
+ DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+ FloorDivideInplaceStridedFactory, num_types>
+ dtb4;
+ dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table);
+
+ // function pointers for inplace operation on contiguous inputs and output
+ using fn_ns::FloorDivideInplaceContigFactory;
+ DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+ FloorDivideInplaceContigFactory, num_types>
+ dtb5;
+ dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table);
+
+ // which types are supported by the in-place kernels
+ using fn_ns::FloorDivideInplaceTypeMapFactory;
+ DispatchTableBuilder<int, FloorDivideInplaceTypeMapFactory, num_types>
+ dtb6;
+ dtb6.populate_dispatch_table(floor_divide_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_floor_divide(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_floor_divide_dispatch_tables();
+ using impl::floor_divide_contig_dispatch_table;
+ using impl::floor_divide_output_id_table;
+ using impl::floor_divide_strided_dispatch_table;
+
+ auto floor_divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+ const arrayT &dst, sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_binary_ufunc(
+ src1, src2, dst, exec_q, depends, floor_divide_output_id_table,
+ // function pointers to handle operation on contiguous arrays
+ // (pointers may be nullptr)
+ floor_divide_contig_dispatch_table,
+ // function pointers to handle operation on strided arrays (most
+ // general case)
+ floor_divide_strided_dispatch_table,
+ // function pointers to handle operation of c-contig matrix and
+ // c-contig row with broadcasting (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+ // function pointers to handle operation of c-contig row and
+ // c-contig matrix with broadcasting (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+ };
+ auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1,
+ const py::dtype &dtype2) {
+ return py_binary_ufunc_result_type(dtype1, dtype2,
+ floor_divide_output_id_table);
+ };
+ m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"),
+ py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+ py::arg("depends") = py::list());
+ m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, "");
+
+ using impl::floor_divide_inplace_contig_dispatch_table;
+ using impl::floor_divide_inplace_output_id_table;
+ using impl::floor_divide_inplace_strided_dispatch_table;
+
+ auto floor_divide_inplace_pyapi = [&](const arrayT &src,
+ const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_binary_inplace_ufunc(
+ src, dst, exec_q, depends, floor_divide_inplace_output_id_table,
+ // function pointers to handle inplace operation on
+ // contiguous arrays (pointers may be nullptr)
+ floor_divide_inplace_contig_dispatch_table,
+ // function pointers to handle inplace operation on strided
+ // arrays (most general case)
+ floor_divide_inplace_strided_dispatch_table,
+ // function pointers to handle inplace operation on
+ // c-contig matrix with c-contig row with broadcasting
+ // (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+ };
+ m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "",
+ py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+ py::arg("depends") = py::list());
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
new file mode 100644
index 000000000000..17d493b58057
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_floor_divide(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
new file mode 100644
index 000000000000..f3cfaeae2286
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
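+///
+/// Like the other binary comparisons in this directory, greater specializes
+/// only the generic contiguous and strided code paths and passes empty
+/// NullPtrTable entries for the matrix/row broadcast slots of
+/// py_binary_ufunc. The `_greater_result_type(dtype1, dtype2)` query returns
+/// the result dtype (boolean for supported pairs) or None otherwise.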
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
new file mode 100644
index 000000000000..f3cfaeae2286
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "greater.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B11: ===== GREATER (x1, x2)
+namespace impl
+{
+namespace greater_fn_ns = dpctl::tensor::kernels::greater;
+
+static binary_contig_impl_fn_ptr_t
+    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterTypeMapFactory;
+    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_dispatch_tables();
+        using impl::greater_contig_dispatch_table;
+        using impl::greater_output_id_table;
+        using impl::greater_strided_dispatch_table;
+
+        auto greater_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                greater_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_output_id_table);
+        };
+        m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_result_type", greater_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater.hpp
new file mode 100644
index 000000000000..c8c3caa5f1fd
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_greater(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
new file mode 100644
index 000000000000..ad9af91ce3d8
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "greater_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater_equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B12: ===== GREATER_EQUAL (x1, x2)
+namespace impl
+{
+namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
+
+static binary_contig_impl_fn_ptr_t
+    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterEqualTypeMapFactory;
+    DispatchTableBuilder<int, GreaterEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         GreaterEqualStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         GreaterEqualContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_equal_dispatch_tables();
+        using impl::greater_equal_contig_dispatch_table;
+        using impl::greater_equal_output_id_table;
+        using impl::greater_equal_strided_dispatch_table;
+
+        auto greater_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                       const arrayT &dst, sycl::queue &exec_q,
+                                       const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                greater_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                   const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_equal_output_id_table);
+        };
+        m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_equal_result_type", greater_equal_result_type_pyapi,
+              "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
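Note: each binding above hands py_binary_ufunc both a contiguous and a strided dispatch table. A simplified sketch of the selection it is assumed to perform is below; the actual routine also validates type support, handles broadcasting, and tries the matrix/row fast paths before falling back. Names here are illustrative only.

```cpp
#include <iostream>

using fn = const char *(*)();

static const char *contig_kernel() { return "contig kernel"; }
static const char *strided_kernel() { return "strided kernel"; }

// Prefer the contiguous kernel when inputs and destination are all
// C-contiguous and a kernel is registered; otherwise use the strided one.
static const char *select(bool all_c_contig, fn contig_fn, fn strided_fn)
{
    if (all_c_contig && contig_fn != nullptr) {
        return contig_fn(); // single linear-index kernel, no stride math
    }
    return strided_fn(); // most general case, always populated
}

int main()
{
    std::cout << select(true, contig_kernel, strided_kernel) << "\n";
    std::cout << select(false, contig_kernel, strided_kernel) << "\n";
}
```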
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
new file mode 100644
index 000000000000..0cf7f8e89bbf
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_greater_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
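Note: the td_ns::NullPtrTable<...>{} arguments passed throughout these bindings stand for "no specialized kernel registered for this shape pattern"; the caller then takes the general strided path. A toy model of what a null-initialized table conveys (names are illustrative, not dpctl's):

```cpp
#include <cstddef>

using fn_ptr_t = void (*)();

// Value-initialized member array: every entry starts out as nullptr,
// which is exactly the signal a NullPtrTable sends to py_binary_ufunc.
template <typename FnT, std::size_t N = 3> struct ToyNullPtrTable
{
    FnT table[N][N] = {};
};

template <typename FnT, std::size_t N>
bool has_specialization(const FnT (&tbl)[N][N], std::size_t i, std::size_t j)
{
    return tbl[i][j] != nullptr; // caller falls back when false
}

int main()
{
    ToyNullPtrTable<fn_ptr_t> t{};
    return has_specialization(t.table, 0, 0) ? 1 : 0; // returns 0: fall back
}
```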
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.cpp
new file mode 100644
index 000000000000..f4ce161f4cda
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "hypot.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/hypot.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B24: ===== HYPOT (x1, x2)
+namespace impl
+{
+namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
+
+static binary_contig_impl_fn_ptr_t
+    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_hypot_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = hypot_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::HypotTypeMapFactory;
+    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(hypot_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::HypotStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::HypotContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_hypot(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_hypot_dispatch_tables();
+        using impl::hypot_contig_dispatch_table;
+        using impl::hypot_output_id_table;
+        using impl::hypot_strided_dispatch_table;
+
+        auto hypot_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, hypot_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                hypot_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                hypot_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto hypot_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               hypot_output_id_table);
+        };
+        m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_hypot_result_type", hypot_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/hypot.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.hpp
new file mode 100644
index 000000000000..5bc73e717ad3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_hypot(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/imag.cpp
new file mode 100644
index 000000000000..833295d22891
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/imag.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "imag.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/imag.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U16: ==== IMAG (x)
+namespace impl
+{
+
+namespace imag_fn_ns = dpctl::tensor::kernels::imag;
+
+static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types];
+static int imag_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    imag_strided_dispatch_vector[td_ns::num_types];
+
+void populate_imag_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = imag_fn_ns;
+
+    using fn_ns::ImagContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ImagContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(imag_contig_dispatch_vector);
+
+    using fn_ns::ImagStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ImagStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(imag_strided_dispatch_vector);
+
+    using fn_ns::ImagTypeMapFactory;
+    DispatchVectorBuilder<int, ImagTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(imag_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_imag(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_imag_dispatch_vectors();
+        using impl::imag_contig_dispatch_vector;
+        using impl::imag_output_typeid_vector;
+        using impl::imag_strided_dispatch_vector;
+
+        auto imag_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, imag_output_typeid_vector,
+                imag_contig_dispatch_vector, imag_strided_dispatch_vector);
+        };
+        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
+        };
+        m.def("_imag_result_type", imag_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/imag.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/imag.hpp
new file mode 100644
index 000000000000..7cc285855328
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/imag.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_imag(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.cpp
new file mode 100644
index 000000000000..1882406b37f3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.cpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "isfinite.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isfinite.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U17: ==== ISFINITE (x)
+namespace impl
+{
+
+namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite;
+
+static unary_contig_impl_fn_ptr_t
+    isfinite_contig_dispatch_vector[td_ns::num_types];
+static int isfinite_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isfinite_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isfinite_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isfinite_fn_ns;
+
+    using fn_ns::IsFiniteContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsFiniteContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector);
+
+    using fn_ns::IsFiniteStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsFiniteStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector);
+
+    using fn_ns::IsFiniteTypeMapFactory;
+    DispatchVectorBuilder<int, IsFiniteTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isfinite_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isfinite(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isfinite_dispatch_vectors();
+        using impl::isfinite_contig_dispatch_vector;
+        using impl::isfinite_output_typeid_vector;
+        using impl::isfinite_strided_dispatch_vector;
+
+        auto isfinite_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  isfinite_output_typeid_vector,
+                                  isfinite_contig_dispatch_vector,
+                                  isfinite_strided_dispatch_vector);
+        };
+        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isfinite_output_typeid_vector);
+        };
+        m.def("_isfinite_result_type", isfinite_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
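Note: the unary functions in this hunk (imag, isfinite, isinf, isnan) use DispatchVectorBuilder, indexed by a single input type id, instead of the num_types x num_types tables the binary functions build. A scaled-down model under the same toy-naming assumption as the earlier sketch:

```cpp
#include <cmath>
#include <cstddef>
#include <iostream>

constexpr std::size_t num_types = 2;

using unary_fn = double (*)(double);

// Factory templated on a single input type id, mirroring the per-dtype
// ContigFactory/StridedFactory instantiations above.
template <std::size_t T> struct ToyAbsFactory
{
    unary_fn get() { return [](double x) { return std::fabs(x); }; }
};

static unary_fn toy_vector[num_types];

template <std::size_t T = 0> void fill_vector()
{
    toy_vector[T] = ToyAbsFactory<T>{}.get();
    if constexpr (T + 1 < num_types)
        fill_vector<T + 1>();
}

int main()
{
    fill_vector();
    std::cout << toy_vector[0](-2.5) << "\n"; // prints 2.5
}
```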
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.hpp
new file mode 100644
index 000000000000..31691916c1f8
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_isfinite(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
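Note: the _*_result_type helpers registered in these files only consult the output typeid vector (or table) built at init time. A simplified sketch of that lookup; the real py_unary_ufunc_result_type first maps a py::dtype to a type id, and -1 below is only assumed to play the role of the "unsupported" marker:

```cpp
#include <iostream>

constexpr int num_types = 4;
// For isinf/isnan/isfinite every supported input maps to bool's type id;
// bool_typeid = 0 is an illustrative value, not dpctl's actual id.
constexpr int bool_typeid = 0;

static int output_typeid_vector[num_types] = {bool_typeid, bool_typeid,
                                              bool_typeid, -1};

static int result_typeid(int src_typeid)
{
    return output_typeid_vector[src_typeid]; // -1: no kernel for this dtype
}

int main()
{
    std::cout << result_typeid(1) << " " << result_typeid(3) << "\n"; // 0 -1
}
```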
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "isinf.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/isinf.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U18: ==== ISINF (x) +namespace impl +{ + +namespace isinf_fn_ns = dpctl::tensor::kernels::isinf; + +static unary_contig_impl_fn_ptr_t + isinf_contig_dispatch_vector[td_ns::num_types]; +static int isinf_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isinf_strided_dispatch_vector[td_ns::num_types]; + +void populate_isinf_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isinf_fn_ns; + + using fn_ns::IsInfContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector); + + using fn_ns::IsInfStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector); + + using fn_ns::IsInfTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isinf_output_typeid_vector); +}; + +} // namespace impl + +void init_isinf(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_isinf_dispatch_vectors(); + using impl::isinf_contig_dispatch_vector; + using impl::isinf_output_typeid_vector; + using impl::isinf_strided_dispatch_vector; + + auto isinf_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, isinf_output_typeid_vector, + isinf_contig_dispatch_vector, isinf_strided_dispatch_vector); + }; + m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto isinf_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + isinf_output_typeid_vector); + }; + m.def("_isinf_result_type", isinf_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isinf.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isinf.hpp new file mode 100644 index 000000000000..3dec9f20c791 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/isinf.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_isinf(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.cpp new file mode 100644 index 000000000000..ce832d0a0ed3 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "isnan.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/isnan.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U19: ==== ISNAN (x) +namespace impl +{ + +namespace isnan_fn_ns = dpctl::tensor::kernels::isnan; + +static unary_contig_impl_fn_ptr_t + isnan_contig_dispatch_vector[td_ns::num_types]; +static int isnan_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isnan_strided_dispatch_vector[td_ns::num_types]; + +void populate_isnan_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isnan_fn_ns; + + using fn_ns::IsNanContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector); + + using fn_ns::IsNanStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector); + + using fn_ns::IsNanTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isnan_output_typeid_vector); +}; + +} // namespace impl + +void init_isnan(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_isnan_dispatch_vectors(); + using impl::isnan_contig_dispatch_vector; + using impl::isnan_output_typeid_vector; + using impl::isnan_strided_dispatch_vector; + + auto isnan_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, isnan_output_typeid_vector, + isnan_contig_dispatch_vector, isnan_strided_dispatch_vector); + }; + m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto isnan_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + isnan_output_typeid_vector); + }; + m.def("_isnan_result_type", isnan_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isnan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.hpp new file mode 100644 index 000000000000..d5a8cdae37e8 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_isnan(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/less.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/less.cpp new file mode 100644 index 000000000000..d587ee713178 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/less.cpp @@ -0,0 +1,145 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "less.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/less.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B13: ===== LESS (x1, x2) +namespace impl +{ +namespace less_fn_ns = dpctl::tensor::kernels::less; + +static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int less_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + less_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_less_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = less_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LessTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(less_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LessStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(less_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LessContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(less_contig_dispatch_table); +}; + +} // namespace impl + +void init_less(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_less_dispatch_tables(); + using impl::less_contig_dispatch_table; + using impl::less_output_id_table; + using impl::less_strided_dispatch_table; + + auto less_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, less_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + less_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + less_strided_dispatch_table, + // function pointers to handle 
operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig row and + // c-contig matrix with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto less_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + less_output_id_table); + }; + m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_less_result_type", less_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/less.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/less.hpp new file mode 100644 index 000000000000..e08d84f380da --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/less.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
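Note on the dispatch pattern used throughout these files: at call time, py_binary_ufunc reduces to two typeid-indexed table lookups, and the NullPtrTable arguments above simply supply all-nullptr tables to opt out of the matrix-row broadcast fast paths. A minimal, self-contained sketch of that lookup, with illustrative names and an assumed num_types (the real constants live in utils/type_dispatch.hpp):

#include <cstddef>

constexpr int num_types = 14; // assumption: stands in for td_ns::num_types

using binary_fn_t = void (*)(const void *, const void *, void *, std::size_t);

// result typeid for each (src1, src2) typeid pair, -1 where unsupported
static int output_id_table[num_types][num_types];
// specialized kernels; entries may be nullptr, and the broadcast fast-path
// tables passed as td_ns::NullPtrTable above are all-nullptr by construction
static binary_fn_t contig_table[num_types][num_types];

binary_fn_t lookup_binary(int src1_typeid, int src2_typeid, int &dst_typeid)
{
    dst_typeid = output_id_table[src1_typeid][src2_typeid];
    if (dst_typeid < 0) {
        return nullptr; // dtype combination rejected before any kernel launch
    }
    return contig_table[src1_typeid][src2_typeid];
}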
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_less(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.cpp new file mode 100644 index 000000000000..433969cead27 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.cpp @@ -0,0 +1,145 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include <sycl/sycl.hpp> + +#include <vector> + +#include "dpnp4pybind11.hpp" +#include <pybind11/complex.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> + +#include "elementwise_functions.hpp" +#include "less_equal.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/less_equal.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B14: ===== LESS_EQUAL (x1, x2) +namespace impl +{ +namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal; + +static binary_contig_impl_fn_ptr_t + less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_less_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = less_equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LessEqualTypeMapFactory; + DispatchTableBuilder<int, LessEqualTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(less_equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LessEqualStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessEqualStridedFactory, + num_types> + dtb2; + dtb2.populate_dispatch_table(less_equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LessEqualContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessEqualContigFactory, + num_types> + dtb3; + dtb3.populate_dispatch_table(less_equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_less_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_less_equal_dispatch_tables(); + using impl::less_equal_contig_dispatch_table; + using impl::less_equal_output_id_table; + using impl::less_equal_strided_dispatch_table; + + auto less_equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, less_equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + less_equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + less_equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig row and + // c-contig matrix with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + less_equal_output_id_table); + }; + m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + 
m.def("_less_equal_result_type", less_equal_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.hpp new file mode 100644 index 000000000000..8eeb837a35a7 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_less_equal(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log.cpp new file mode 100644 index 000000000000..2906304eaffa --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U20: ==== LOG (x) +namespace impl +{ + +namespace log_fn_ns = dpctl::tensor::kernels::log; + +static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types]; +static int log_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log_strided_dispatch_vector[td_ns::num_types]; + +void populate_log_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log_fn_ns; + + using fn_ns::LogContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log_contig_dispatch_vector); + + using fn_ns::LogStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log_strided_dispatch_vector); + + using fn_ns::LogTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log_output_typeid_vector); +}; + +} // namespace impl + +void init_log(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log_dispatch_vectors(); + using impl::log_contig_dispatch_vector; + using impl::log_output_typeid_vector; + using impl::log_strided_dispatch_vector; + + auto log_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, 
log_output_typeid_vector, + log_contig_dispatch_vector, log_strided_dispatch_vector); + }; + m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, log_output_typeid_vector); + }; + m.def("_log_result_type", log_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log.hpp new file mode 100644 index 000000000000..fb065e82e037 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_log(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log10.cpp new file mode 100644 index 000000000000..9501af987341 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log10.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
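The unary entries (log, log10, log1p, log2, logical_not) follow the one-dimensional analogue of the same scheme: a typeid-indexed vector rather than a table. A sketch of the lookup py_unary_ufunc performs, again with illustrative names only:

#include <cstddef>

constexpr int num_types = 14; // assumption: stands in for td_ns::num_types

using unary_fn_t = void (*)(const void *, void *, std::size_t);

static int output_typeid_vector[num_types];          // -1 where unsupported
static unary_fn_t contig_dispatch_vector[num_types]; // nullptr where unsupported

unary_fn_t lookup_unary(int src_typeid, int &dst_typeid)
{
    dst_typeid = output_typeid_vector[src_typeid];
    return (dst_typeid < 0) ? nullptr : contig_dispatch_vector[src_typeid];
}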
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log10.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log10.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U23: ==== LOG10 (x) +namespace impl +{ + +namespace log10_fn_ns = dpctl::tensor::kernels::log10; + +static unary_contig_impl_fn_ptr_t + log10_contig_dispatch_vector[td_ns::num_types]; +static int log10_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log10_strided_dispatch_vector[td_ns::num_types]; + +void populate_log10_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log10_fn_ns; + + using fn_ns::Log10ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log10_contig_dispatch_vector); + + using fn_ns::Log10StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log10_strided_dispatch_vector); + + using fn_ns::Log10TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log10_output_typeid_vector); +}; + +} // namespace impl + +void init_log10(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log10_dispatch_vectors(); + using impl::log10_contig_dispatch_vector; + using impl::log10_output_typeid_vector; + using impl::log10_strided_dispatch_vector; + + auto log10_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log10_output_typeid_vector, + log10_contig_dispatch_vector, log10_strided_dispatch_vector); + }; + m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log10_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + log10_output_typeid_vector); + }; + m.def("_log10_result_type", log10_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log10.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log10.hpp new file mode 100644 index 000000000000..779b15472462 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log10.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_log10(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.cpp new file mode 100644 index 000000000000..c94b3f3b5d7d --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
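Carrying a dedicated log1p kernel alongside log is not redundancy: log(1 + x) loses essentially all significant digits when x is near zero, while log1p evaluates the same quantity accurately. A standalone check of the effect:

#include <cmath>
#include <cstdio>

int main()
{
    double x = 1e-16;
    // 1.0 + x rounds back to exactly 1.0 in double precision,
    // so the naive form collapses to log(1.0) == 0
    std::printf("log(1 + x) = %.17g\n", std::log(1.0 + x));
    std::printf("log1p(x)   = %.17g\n", std::log1p(x));
    return 0;
}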
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log1p.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log1p.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U21: ==== LOG1P (x) +namespace impl +{ + +namespace log1p_fn_ns = dpctl::tensor::kernels::log1p; + +static unary_contig_impl_fn_ptr_t + log1p_contig_dispatch_vector[td_ns::num_types]; +static int log1p_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log1p_strided_dispatch_vector[td_ns::num_types]; + +void populate_log1p_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log1p_fn_ns; + + using fn_ns::Log1pContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector); + + using fn_ns::Log1pStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector); + + using fn_ns::Log1pTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log1p_output_typeid_vector); +}; + +} // namespace impl + +void init_log1p(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log1p_dispatch_vectors(); + using impl::log1p_contig_dispatch_vector; + using impl::log1p_output_typeid_vector; + using impl::log1p_strided_dispatch_vector; + + auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log1p_output_typeid_vector, + log1p_contig_dispatch_vector, log1p_strided_dispatch_vector); + }; + m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log1p_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + log1p_output_typeid_vector); + }; + m.def("_log1p_result_type", log1p_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log1p.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.hpp new file mode 100644 index 000000000000..85bf21c8ea48 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_log1p(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log2.cpp new file mode 100644 index 000000000000..825d516f7820 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log2.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log2.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log2.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U22: ==== LOG2 (x) +namespace impl +{ + +namespace log2_fn_ns = dpctl::tensor::kernels::log2; + +static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types]; +static int log2_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log2_strided_dispatch_vector[td_ns::num_types]; + +void populate_log2_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log2_fn_ns; + + using fn_ns::Log2ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log2_contig_dispatch_vector); + + using fn_ns::Log2StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log2_strided_dispatch_vector); + + using fn_ns::Log2TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log2_output_typeid_vector); +}; + +} // namespace impl + +void init_log2(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log2_dispatch_vectors(); + using impl::log2_contig_dispatch_vector; + using impl::log2_output_typeid_vector; + using impl::log2_strided_dispatch_vector; + + auto log2_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log2_output_typeid_vector, + log2_contig_dispatch_vector, log2_strided_dispatch_vector); + }; + m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log2_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector); + }; + m.def("_log2_result_type", log2_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log2.hpp new file mode 100644 index 000000000000..11f757b1449d --- /dev/null +++ 
b/dpnp/tensor/libtensor/source/elementwise_functions/log2.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_log2(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.cpp new file mode 100644 index 000000000000..71bc9cad4035 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.cpp @@ -0,0 +1,145 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
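Every m.def(...) in these files uses the same keyword-argument convention, with depends defaulting to an empty Python list that the STL caster turns into an empty event vector at call time. A self-contained sketch of that pybind11 pattern (module and function names are examples, not the real binding):

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <cstddef>
#include <vector>

namespace py = pybind11;

// stands in for a kernel launcher that waits on dependent events
std::size_t submit(const std::vector<int> &depends)
{
    return depends.size();
}

PYBIND11_MODULE(_example_impl, m)
{
    // callers may omit "depends"; it then arrives as an empty vector
    m.def("_submit", &submit, "", py::arg("depends") = py::list());
}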
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "logaddexp.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/logaddexp.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B15: ===== LOGADDEXP (x1, x2) +namespace impl +{ +namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp; + +static binary_contig_impl_fn_ptr_t + logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_logaddexp_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = logaddexp_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LogAddExpTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(logaddexp_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LogAddExpStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LogAddExpContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table); +}; + +} // namespace impl + +void init_logaddexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_logaddexp_dispatch_tables(); + using impl::logaddexp_contig_dispatch_table; + using impl::logaddexp_output_id_table; + using impl::logaddexp_strided_dispatch_table; + + auto logaddexp_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends 
= {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, logaddexp_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + logaddexp_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + logaddexp_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig row and + // c-contig matrix with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + logaddexp_output_id_table); + }; + m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.hpp new file mode 100644 index 000000000000..2c4efa7d0c56 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file
/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
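Each translation unit above only exports an init_<op>(py::module_) hook; the hooks are presumably aggregated in a single module definition elsewhere in the PR. A sketch of that aggregation, with stub bodies so it links standalone (the module name is taken from the \file comments and is an assumption here):

#include <pybind11/pybind11.h>

namespace py = pybind11;

namespace dpctl::tensor::py_internal
{
// stand-in definitions; the real ones are the init_* functions above
void init_logaddexp(py::module_) {}
void init_logical_and(py::module_) {}
} // namespace dpctl::tensor::py_internal

PYBIND11_MODULE(_tensor_elementwise_impl, m)
{
    using namespace dpctl::tensor::py_internal;
    init_logaddexp(m);
    init_logical_and(m);
    // ... one init_* call per elementwise operation
}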
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_logaddexp(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.cpp new file mode 100644 index 000000000000..90c0b52a6aa2 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.cpp @@ -0,0 +1,146 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include <sycl/sycl.hpp> + +#include <vector> + +#include "dpnp4pybind11.hpp" +#include <pybind11/complex.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> + +#include "elementwise_functions.hpp" +#include "logical_and.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/logical_and.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B16: ===== LOGICAL_AND (x1, x2) +namespace impl +{ +namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and; + +static binary_contig_impl_fn_ptr_t + logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_logical_and_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = logical_and_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LogicalAndTypeMapFactory; + DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(logical_and_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LogicalAndStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalAndStridedFactory, + num_types> + dtb2; + dtb2.populate_dispatch_table(logical_and_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LogicalAndContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory, + num_types> + dtb3; + dtb3.populate_dispatch_table(logical_and_contig_dispatch_table); +}; + +} // namespace impl + +void init_logical_and(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_logical_and_dispatch_tables(); + using impl::logical_and_contig_dispatch_table; + using impl::logical_and_output_id_table; + using impl::logical_and_strided_dispatch_table; + + auto logical_and_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, logical_and_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + logical_and_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + logical_and_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig row and + // c-contig matrix with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + logical_and_output_id_table); + }; + m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"), + py::arg("src2"), 
py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logical_and_result_type", logical_and_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp new file mode 100644 index 000000000000..c22a98f24146 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_logical_and(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp new file mode 100644 index 000000000000..e8f5845fac16 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp @@ -0,0 +1,129 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
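Comparison and logical operations such as logical_and share one typemap property: every supported input pair yields a bool result. An illustrative stand-in for what the TypeMapFactory encodes (the real factory walks the td_ns typelist; the void-means-unsupported convention below is only for this sketch):

#include <type_traits>

template <typename T1, typename T2> struct LogicalAndOutputSketch
{
    // any arithmetic pair is "supported" here and maps to bool;
    // everything else maps to void, i.e. unsupported
    using value_type = std::conditional_t<std::is_arithmetic_v<T1> &&
                                              std::is_arithmetic_v<T2>,
                                          bool, void>;
};

static_assert(
    std::is_same_v<LogicalAndOutputSketch<int, float>::value_type, bool>);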
+        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp
new file mode 100644
index 000000000000..c22a98f24146
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_and(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp
new file mode 100644
index 000000000000..e8f5845fac16
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp
@@ -0,0 +1,129 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_not.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_not.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U24: ==== LOGICAL_NOT (x)
+namespace impl
+{
+
+namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not;
+
+static unary_contig_impl_fn_ptr_t
+    logical_not_contig_dispatch_vector[td_ns::num_types];
+static int logical_not_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    logical_not_strided_dispatch_vector[td_ns::num_types];
+
+void populate_logical_not_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_not_fn_ns;
+
+    using fn_ns::LogicalNotContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogicalNotContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector);
+
+    using fn_ns::LogicalNotStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogicalNotStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector);
+
+    using fn_ns::LogicalNotTypeMapFactory;
+    DispatchVectorBuilder<int, LogicalNotTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(logical_not_output_typeid_vector);
+};
+
+} // namespace impl
+void init_logical_not(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_not_dispatch_vectors();
+        using impl::logical_not_contig_dispatch_vector;
+        using impl::logical_not_output_typeid_vector;
+        using impl::logical_not_strided_dispatch_vector;
+
+        auto logical_not_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  logical_not_output_typeid_vector,
+                                  logical_not_contig_dispatch_vector,
+                                  logical_not_strided_dispatch_vector);
+        };
+        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              logical_not_output_typeid_vector);
+        };
+        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.hpp
new file mode 100644
index 000000000000..f3bb79cc28cc
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_not(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.cpp
new file mode 100644
index 000000000000..38c981792345
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_or.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_or.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B17: ===== LOGICAL_OR (x1, x2)
+namespace impl
+{
+namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
+
+static binary_contig_impl_fn_ptr_t
+    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalOrTypeMapFactory;
+    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_or_dispatch_tables();
+        using impl::logical_or_contig_dispatch_table;
+        using impl::logical_or_output_id_table;
+        using impl::logical_or_strided_dispatch_table;
+
+        auto logical_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                logical_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
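+        // logical_or has no specialized matrix/row broadcast kernels; the
+        // NullPtrTable arguments above leave those slots empty so the general
+        // strided path handles such cases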
+        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_or_output_id_table);
+        };
+        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.hpp
new file mode 100644
index 000000000000..11e83fe8cedf
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_or(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
new file mode 100644
index 000000000000..759133ca6120
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_xor.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_xor.hpp"
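+// the numbered markers below (B18 for logical_xor) index the table of binary
+// and unary elementwise operations this extension implements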
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B18: ===== LOGICAL_XOR (x1, x2)
+namespace impl
+{
+namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
+
+static binary_contig_impl_fn_ptr_t
+    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalXorTypeMapFactory;
+    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalXorStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_xor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_xor_dispatch_tables();
+        using impl::logical_xor_contig_dispatch_table;
+        using impl::logical_xor_output_id_table;
+        using impl::logical_xor_strided_dispatch_table;
+
+        auto logical_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                logical_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_xor_output_id_table);
+        };
+        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
new file mode 100644
index 000000000000..24c163249128
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
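+// declaration only: logical_xor.cpp defines init_logical_xor and registers
+// the _logical_xor entry points with the extension module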
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_xor(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.cpp
new file mode 100644
index 000000000000..8fda65c43dca
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "maximum.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/maximum.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B26: ===== MAXIMUM (x1, x2)
+namespace impl
+{
+namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
+
+static binary_contig_impl_fn_ptr_t
+    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_maximum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = maximum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MaximumTypeMapFactory;
+    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(maximum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MaximumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MaximumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_maximum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_maximum_dispatch_tables();
+        using impl::maximum_contig_dispatch_table;
+        using impl::maximum_output_id_table;
+        using impl::maximum_strided_dispatch_table;
+
+        auto maximum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, maximum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                maximum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                maximum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               maximum_output_id_table);
+        };
+        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
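+        // _maximum and _maximum_result_type are registered as a pair, the
+        // same pattern every binary function in this extension follows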
+        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.hpp
new file mode 100644
index 000000000000..1f8fc027ac1d
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_maximum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.cpp
new file mode 100644
index 000000000000..7055ce5c72f5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "minimum.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/minimum.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B27: ===== MINIMUM (x1, x2)
+namespace impl
+{
+namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
+
+static binary_contig_impl_fn_ptr_t
+    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_minimum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = minimum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MinimumTypeMapFactory;
+    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(minimum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MinimumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MinimumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_minimum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
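+    // the extra block scope below keeps the using-declarations that pull the
+    // impl:: dispatch tables into view local to the registration code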
+    {
+        impl::populate_minimum_dispatch_tables();
+        using impl::minimum_contig_dispatch_table;
+        using impl::minimum_output_id_table;
+        using impl::minimum_strided_dispatch_table;
+
+        auto minimum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, minimum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                minimum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                minimum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               minimum_output_id_table);
+        };
+        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.hpp
new file mode 100644
index 000000000000..be2e18a9b37c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_minimum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.cpp
new file mode 100644
index 000000000000..5d25f8cc7b19
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.cpp
@@ -0,0 +1,244 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "multiply.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/multiply.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B19: ===== MULTIPLY (x1, x2)
+namespace impl
+{
+
+namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
+
+static binary_contig_impl_fn_ptr_t
+    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
+static int multiply_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// mul(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    multiply_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// mul(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    multiply_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_multiply_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = multiply_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MultiplyTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(multiply_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MultiplyStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MultiplyContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
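+    // the in-place variant (x1 *= x2) dispatches through separate tables,
+    // populated below from the Inplace* factories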
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::MultiplyInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         MultiplyInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::MultiplyInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         MultiplyInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::MultiplyInplaceTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(multiply_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_multiply(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_multiply_dispatch_tables();
+        using impl::multiply_contig_dispatch_table;
+        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::multiply_output_id_table;
+        using impl::multiply_strided_dispatch_table;
+
+        auto multiply_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                multiply_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                multiply_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig row
+                // and c-contig matrix with broadcasting (may be nullptr)
+                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               multiply_output_id_table);
+        };
+        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
+
+        using impl::multiply_inplace_contig_dispatch_table;
+        using impl::multiply_inplace_output_id_table;
+        using impl::multiply_inplace_row_matrix_dispatch_table;
+        using impl::multiply_inplace_strided_dispatch_table;
+
+        auto multiply_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, multiply_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                multiply_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                multiply_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                multiply_inplace_row_matrix_dispatch_table);
+        };
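+        // the in-place entry point takes two arrays (lhs, rhs) and no
+        // separate destination argument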
+        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.hpp
new file mode 100644
index 000000000000..a4ed946a8501
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_multiply(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/negative.cpp
new file mode 100644
index 000000000000..8510a15eab00
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/negative.cpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "negative.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/negative.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U25: ==== NEGATIVE (x)
+namespace impl
+{
+
+namespace negative_fn_ns = dpctl::tensor::kernels::negative;
+
+static unary_contig_impl_fn_ptr_t
+    negative_contig_dispatch_vector[td_ns::num_types];
+static int negative_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    negative_strided_dispatch_vector[td_ns::num_types];
+
+void populate_negative_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = negative_fn_ns;
+
+    using fn_ns::NegativeContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, NegativeContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(negative_contig_dispatch_vector);
+
+    using fn_ns::NegativeStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, NegativeStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(negative_strided_dispatch_vector);
+
+    using fn_ns::NegativeTypeMapFactory;
+    DispatchVectorBuilder<int, NegativeTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(negative_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_negative(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_negative_dispatch_vectors();
+        using impl::negative_contig_dispatch_vector;
+        using impl::negative_output_typeid_vector;
+        using impl::negative_strided_dispatch_vector;
+
+        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  negative_output_typeid_vector,
+                                  negative_contig_dispatch_vector,
+                                  negative_strided_dispatch_vector);
+        };
+        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
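+        // unary result-type query: maps a single input dtype to the output
+        // dtype recorded in negative_output_typeid_vector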
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_negative(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.cpp new file mode 100644 index 000000000000..42e1ac9bd4c3 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.cpp @@ -0,0 +1,146 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "nextafter.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/nextafter.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B28: ===== NEXTAFTER (x1, x2) +namespace impl +{ +namespace nextafter_fn_ns = dpctl::tensor::kernels::nextafter; + +static binary_contig_impl_fn_ptr_t + nextafter_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int nextafter_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + nextafter_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_nextafter_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = nextafter_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::NextafterTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(nextafter_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::NextafterStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(nextafter_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::NextafterContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(nextafter_contig_dispatch_table); +}; + +} // namespace impl + +void init_nextafter(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_nextafter_dispatch_tables(); + using impl::nextafter_contig_dispatch_table; + using impl::nextafter_output_id_table; + using impl::nextafter_strided_dispatch_table; + + auto nextafter_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, nextafter_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + 
nextafter_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + nextafter_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto nextafter_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + nextafter_output_id_table); + }; + m.def("_nextafter", nextafter_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_nextafter_result_type", nextafter_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.hpp new file mode 100644 index 000000000000..76ad701d4012 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
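+//
+// Every header in this directory exposes a single registration hook of this
+// shape; a central initializer is expected to call them in sequence. A
+// hedged sketch (the aggregating translation unit is not part of this hunk):
+//
+//     void init_elementwise_functions(py::module_ m)
+//     {
+//         init_nextafter(m);
+//         init_not_equal(m);
+//         // ... one call per elementwise operation
+//     }
+//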
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_nextafter(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.cpp
new file mode 100644
index 000000000000..dcbbf0cf015e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
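+//
+// NOT_EQUAL is a comparison ufunc: its type-map factory resolves every
+// supported input-type pair to a boolean output, so, illustratively,
+// `_not_equal_result_type(int32, float32)` is expected to report the boolean
+// dtype rather than a promoted arithmetic type (hedged; exact dtype
+// spellings depend on the Python-side wrapper).
+//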
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "not_equal.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/not_equal.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B20: ===== NOT_EQUAL (x1, x2) +namespace impl +{ +namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal; + +static binary_contig_impl_fn_ptr_t + not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_not_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = not_equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::NotEqualTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(not_equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::NotEqualStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(not_equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::NotEqualContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(not_equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_not_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_not_equal_dispatch_tables(); + using impl::not_equal_contig_dispatch_table; + using impl::not_equal_output_id_table; + using impl::not_equal_strided_dispatch_table; + + auto not_equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, not_equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + not_equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + not_equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + not_equal_output_id_table); + }; + m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") 
= py::list()); + m.def("_not_equal_result_type", not_equal_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.hpp new file mode 100644 index 000000000000..c6c99bb793bc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_not_equal(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/positive.cpp new file mode 100644 index 000000000000..6518b10a77c0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/positive.cpp @@ -0,0 +1,128 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "positive.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/positive.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U26: ==== POSITIVE (x) +namespace impl +{ + +namespace positive_fn_ns = dpctl::tensor::kernels::positive; + +static unary_contig_impl_fn_ptr_t + positive_contig_dispatch_vector[td_ns::num_types]; +static int positive_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + positive_strided_dispatch_vector[td_ns::num_types]; + +void populate_positive_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = positive_fn_ns; + + using fn_ns::PositiveContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(positive_contig_dispatch_vector); + + using fn_ns::PositiveStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(positive_strided_dispatch_vector); + + using fn_ns::PositiveTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(positive_output_typeid_vector); +}; + +} // namespace impl + +void init_positive(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_positive_dispatch_vectors(); + using impl::positive_contig_dispatch_vector; + using impl::positive_output_typeid_vector; + using impl::positive_strided_dispatch_vector; + + auto positive_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue 
&exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + positive_output_typeid_vector, + positive_contig_dispatch_vector, + positive_strided_dispatch_vector); + }; + m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto positive_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + positive_output_typeid_vector); + }; + m.def("_positive_result_type", positive_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/positive.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/positive.hpp new file mode 100644 index 000000000000..05bd04b577af --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/positive.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
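+//
+// Note that the header deliberately exposes only the registration hook; the
+// dispatch vectors themselves are static (translation-unit-local) state in
+// the matching .cpp, so no kernel tables leak across the extension's source
+// files.
+//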
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_positive(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/pow.cpp
new file mode 100644
index 000000000000..990515fa5402
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/pow.cpp
@@ -0,0 +1,203 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
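+//
+// POW registers both an out-of-place binding (_pow) and an in-place one
+// (_pow_inplace). A hedged sketch of the in-place contract (`ti`, `x`, `y`,
+// and `q` are illustrative assumptions):
+//
+//     ht_ev, comp_ev = ti._pow_inplace(lhs=x, rhs=y, sycl_queue=q,
+//                                      depends=[])
+//     ht_ev.wait()  # x now holds x ** y, with no temporary allocated
+//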
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "pow.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/pow.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B21: ===== POW (x1, x2) +namespace impl +{ + +namespace pow_fn_ns = dpctl::tensor::kernels::pow; + +static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static int pow_output_id_table[td_ns::num_types][td_ns::num_types]; +static int pow_inplace_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + pow_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + pow_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_pow_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = pow_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::PowTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(pow_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::PowStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(pow_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::PowContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(pow_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::PowInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(pow_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::PowInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(pow_inplace_contig_dispatch_table); + + // which types are supported by the in-place kernels + using fn_ns::PowInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(pow_inplace_output_id_table); +}; + +} // namespace impl + +void init_pow(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_pow_dispatch_tables(); + using impl::pow_contig_dispatch_table; + using impl::pow_output_id_table; + using impl::pow_strided_dispatch_table; + + auto pow_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( 
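+                // argument order: data first (src1, src2, dst), then the
+                // execution queue and dependencies, the result-type map, and
+                // kernel tables from most specialized (contiguous) to most
+                // general (strided); unused broadcast specializations are
+                // passed as NullPtrTable sentinels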
+ src1, src2, dst, exec_q, depends, pow_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + pow_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + pow_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto pow_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + pow_output_id_table); + }; + m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_pow_result_type", pow_result_type_pyapi, ""); + + using impl::pow_inplace_contig_dispatch_table; + using impl::pow_inplace_output_id_table; + using impl::pow_inplace_strided_dispatch_table; + + auto pow_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, pow_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + pow_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + pow_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_pow_inplace", pow_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/pow.hpp new file mode 100644 index 000000000000..197a23b80d8a --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/pow.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_pow(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/proj.cpp new file mode 100644 index 000000000000..9583de8bd195 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/proj.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
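+//
+// PROJ computes the projection onto the Riemann sphere, following std::proj
+// semantics: finite complex values map to themselves, while any value with
+// an infinite component maps to (+inf, copysign(0, imag)). Illustrative
+// expectation (hedged):
+//
+//     complex64 input:  [1+2j, inf-3j]
+//     _proj output:     [1+2j, inf-0j]
+//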
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "proj.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/proj.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U40: ==== PROJ (x) +namespace impl +{ + +namespace proj_fn_ns = dpctl::tensor::kernels::proj; + +static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types]; +static int proj_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + proj_strided_dispatch_vector[td_ns::num_types]; + +void populate_proj_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = proj_fn_ns; + + using fn_ns::ProjContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(proj_contig_dispatch_vector); + + using fn_ns::ProjStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(proj_strided_dispatch_vector); + + using fn_ns::ProjTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(proj_output_typeid_vector); +}; + +} // namespace impl + +void init_proj(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_proj_dispatch_vectors(); + using impl::proj_contig_dispatch_vector; + using impl::proj_output_typeid_vector; + using impl::proj_strided_dispatch_vector; + + auto proj_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, proj_output_typeid_vector, + proj_contig_dispatch_vector, proj_strided_dispatch_vector); + }; + m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto proj_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, proj_output_typeid_vector); + }; + m.def("_proj_result_type", proj_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/proj.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/proj.hpp new file mode 100644 index 000000000000..3cdc0e8271b0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/proj.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_proj(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/real.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/real.cpp new file mode 100644 index 000000000000..6ed3f5fdc404 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/real.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "real.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/real.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U27: ==== REAL (x) +namespace impl +{ + +namespace real_fn_ns = dpctl::tensor::kernels::real; + +static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types]; +static int real_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + real_strided_dispatch_vector[td_ns::num_types]; + +void populate_real_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = real_fn_ns; + + using fn_ns::RealContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(real_contig_dispatch_vector); + + using fn_ns::RealStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(real_strided_dispatch_vector); + + using fn_ns::RealTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(real_output_typeid_vector); +}; + +} // namespace impl + +void init_real(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_real_dispatch_vectors(); + using impl::real_contig_dispatch_vector; + using impl::real_output_typeid_vector; + using impl::real_strided_dispatch_vector; + + auto real_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, real_output_typeid_vector, + real_contig_dispatch_vector, real_strided_dispatch_vector); + }; + m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto real_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, real_output_typeid_vector); + }; + m.def("_real_result_type", real_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/real.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/real.hpp new file mode 100644 index 000000000000..81f4743e823b --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/real.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_real(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.cpp new file mode 100644 index 000000000000..cdb0f43dfbe0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.cpp @@ -0,0 +1,129 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "reciprocal.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/reciprocal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U42: ==== RECIPROCAL (x)
+namespace impl
+{
+
+namespace reciprocal_fn_ns = dpctl::tensor::kernels::reciprocal;
+
+static unary_contig_impl_fn_ptr_t
+    reciprocal_contig_dispatch_vector[td_ns::num_types];
+static int reciprocal_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    reciprocal_strided_dispatch_vector[td_ns::num_types];
+
+void populate_reciprocal_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = reciprocal_fn_ns;
+
+    using fn_ns::ReciprocalContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ReciprocalContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(reciprocal_contig_dispatch_vector);
+
+    using fn_ns::ReciprocalStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
+                          ReciprocalStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(reciprocal_strided_dispatch_vector);
+
+    using fn_ns::ReciprocalTypeMapFactory;
+    DispatchVectorBuilder<int, ReciprocalTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(reciprocal_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_reciprocal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_reciprocal_dispatch_vectors();
+        using impl::reciprocal_contig_dispatch_vector;
+        using impl::reciprocal_output_typeid_vector;
+        using impl::reciprocal_strided_dispatch_vector;
+
+        auto reciprocal_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                    sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  reciprocal_output_typeid_vector,
+                                  reciprocal_contig_dispatch_vector,
+                                  reciprocal_strided_dispatch_vector);
+        };
+        m.def("_reciprocal", reciprocal_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto reciprocal_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              reciprocal_output_typeid_vector);
+        };
+        m.def("_reciprocal_result_type", reciprocal_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git
a/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.hpp new file mode 100644 index 000000000000..1d2156f3464e --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_reciprocal(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp new file mode 100644 index 000000000000..8bdcdbe1b3dd --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp @@ -0,0 +1,205 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp
new file mode 100644
index 000000000000..8bdcdbe1b3dd
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp
@@ -0,0 +1,205 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "remainder.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/remainder.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B22: ===== REMAINDER (x1, x2)
+namespace impl
+{
+
+namespace remainder_fn_ns = dpctl::tensor::kernels::remainder;
+
+static binary_contig_impl_fn_ptr_t
+    remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int remainder_output_id_table[td_ns::num_types][td_ns::num_types];
+static int remainder_inplace_output_id_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    remainder_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    remainder_inplace_strided_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+void populate_remainder_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = remainder_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::RemainderTypeMapFactory;
+    DispatchTableBuilder<int, RemainderTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(remainder_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::RemainderStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, RemainderStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(remainder_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::RemainderContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, RemainderContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(remainder_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::RemainderInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         RemainderInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(remainder_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::RemainderInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         RemainderInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(remainder_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::RemainderInplaceTypeMapFactory;
+    DispatchTableBuilder<int, RemainderInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(remainder_inplace_output_id_table);
+}
+
+} // namespace impl
+
+void init_remainder(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_remainder_dispatch_tables();
+        using impl::remainder_contig_dispatch_table;
+        using impl::remainder_output_id_table;
+        using impl::remainder_strided_dispatch_table;
+
+        auto remainder_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, remainder_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                remainder_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                remainder_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               remainder_output_id_table);
+        };
+        m.def("_remainder", remainder_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
+
+        using impl::remainder_inplace_contig_dispatch_table;
+        using impl::remainder_inplace_output_id_table;
+        using impl::remainder_inplace_strided_dispatch_table;
+
+        auto remainder_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                           sycl::queue &exec_q,
+                                           const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, remainder_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                remainder_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                remainder_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_remainder_inplace", remainder_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.hpp
new file mode 100644
index 000000000000..c00bdc9e0e6c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_remainder(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/round.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/round.cpp
new file mode 100644
index 000000000000..d651b567c3c1
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/round.cpp
@@ -0,0 +1,126 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "round.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/round.hpp" + +namespace dpctl::tensor::py_internal +{ +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U28: ==== ROUND (x) +namespace impl +{ + +namespace round_fn_ns = dpctl::tensor::kernels::round; + +static unary_contig_impl_fn_ptr_t + round_contig_dispatch_vector[td_ns::num_types]; +static int round_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + round_strided_dispatch_vector[td_ns::num_types]; + +void populate_round_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = round_fn_ns; + + using fn_ns::RoundContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(round_contig_dispatch_vector); + + using fn_ns::RoundStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(round_strided_dispatch_vector); + + using fn_ns::RoundTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(round_output_typeid_vector); +}; + +} // namespace impl + +void init_round(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_round_dispatch_vectors(); + using impl::round_contig_dispatch_vector; + using impl::round_output_typeid_vector; + using impl::round_strided_dispatch_vector; + + auto round_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + 
src, dst, exec_q, depends, round_output_typeid_vector, + round_contig_dispatch_vector, round_strided_dispatch_vector); + }; + m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto round_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + round_output_typeid_vector); + }; + m.def("_round_result_type", round_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/round.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/round.hpp new file mode 100644 index 000000000000..ca56e110eec5 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/round.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_round(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.cpp new file mode 100644 index 000000000000..738bef333d75 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "rsqrt.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/rsqrt.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U39: ==== RSQRT (x) +namespace impl +{ + +namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt; + +static unary_contig_impl_fn_ptr_t + rsqrt_contig_dispatch_vector[td_ns::num_types]; +static int rsqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + rsqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_rsqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = rsqrt_fn_ns; + + using fn_ns::RsqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector); + + using fn_ns::RsqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector); + + using fn_ns::RsqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector); +}; + +} // namespace impl + +void init_rsqrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_rsqrt_dispatch_vectors(); + using impl::rsqrt_contig_dispatch_vector; + using impl::rsqrt_output_typeid_vector; + using impl::rsqrt_strided_dispatch_vector; + + auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, rsqrt_output_typeid_vector, + rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector); + }; + m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + rsqrt_output_typeid_vector); + }; + m.def("_rsqrt_result_type", rsqrt_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.hpp new file mode 100644 index 000000000000..4ba740a31777 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_rsqrt(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sign.cpp new file mode 100644 index 000000000000..5051926e7470 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sign.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sign.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sign.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U29: ==== SIGN (x) +namespace impl +{ + +namespace sign_fn_ns = dpctl::tensor::kernels::sign; + +static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types]; +static int sign_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sign_strided_dispatch_vector[td_ns::num_types]; + +void populate_sign_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sign_fn_ns; + + using fn_ns::SignContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sign_contig_dispatch_vector); + + using fn_ns::SignStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sign_strided_dispatch_vector); + + using fn_ns::SignTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sign_output_typeid_vector); +}; + +} // namespace impl + +void init_sign(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sign_dispatch_vectors(); + using impl::sign_contig_dispatch_vector; + using impl::sign_output_typeid_vector; + using impl::sign_strided_dispatch_vector; + + auto sign_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sign_output_typeid_vector, + sign_contig_dispatch_vector, sign_strided_dispatch_vector); + }; + m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sign_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector); + }; + m.def("_sign_result_type", sign_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sign.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sign.hpp new file mode 100644 index 000000000000..19686ada3dbf --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sign.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sign(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.cpp new file mode 100644 index 000000000000..eeef1de50331 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.cpp @@ -0,0 +1,128 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "signbit.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/signbit.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U41: ==== SIGNBIT (x) +namespace impl +{ + +namespace signbit_fn_ns = dpctl::tensor::kernels::signbit; + +static unary_contig_impl_fn_ptr_t + signbit_contig_dispatch_vector[td_ns::num_types]; +static int signbit_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + signbit_strided_dispatch_vector[td_ns::num_types]; + +void populate_signbit_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = signbit_fn_ns; + + using fn_ns::SignbitContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector); + + using fn_ns::SignbitStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector); + + using fn_ns::SignbitTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(signbit_output_typeid_vector); +}; + +} // namespace impl + +void init_signbit(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_signbit_dispatch_vectors(); + using impl::signbit_contig_dispatch_vector; + using impl::signbit_output_typeid_vector; + using impl::signbit_strided_dispatch_vector; + + auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + signbit_output_typeid_vector, + signbit_contig_dispatch_vector, + signbit_strided_dispatch_vector); + }; + m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto signbit_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + signbit_output_typeid_vector); + }; + m.def("_signbit_result_type", signbit_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/signbit.hpp 
b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.hpp new file mode 100644 index 000000000000..292386b174fc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_signbit(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sin.cpp new file mode 100644 index 000000000000..7db753e27c4b --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sin.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sin.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sin.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U30: ==== SIN (x) +namespace impl +{ + +namespace sin_fn_ns = dpctl::tensor::kernels::sin; + +static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types]; +static int sin_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sin_strided_dispatch_vector[td_ns::num_types]; + +void populate_sin_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sin_fn_ns; + + using fn_ns::SinContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sin_contig_dispatch_vector); + + using fn_ns::SinStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sin_strided_dispatch_vector); + + using fn_ns::SinTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sin_output_typeid_vector); +}; + +} // namespace impl + +void init_sin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sin_dispatch_vectors(); + using impl::sin_contig_dispatch_vector; + using impl::sin_output_typeid_vector; + using impl::sin_strided_dispatch_vector; + + auto sin_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sin_output_typeid_vector, + sin_contig_dispatch_vector, sin_strided_dispatch_vector); + }; + m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto 
sin_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector); + }; + m.def("_sin_result_type", sin_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sin.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sin.hpp new file mode 100644 index 000000000000..a4b3da08b7fc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sin.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sin(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.cpp new file mode 100644 index 000000000000..e56a28e0c2aa --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sinh.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sinh.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U31: ==== SINH (x) +namespace impl +{ + +namespace sinh_fn_ns = dpctl::tensor::kernels::sinh; + +static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types]; +static int sinh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sinh_strided_dispatch_vector[td_ns::num_types]; + +void populate_sinh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sinh_fn_ns; + + using fn_ns::SinhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector); + + using fn_ns::SinhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector); + + using fn_ns::SinhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sinh_output_typeid_vector); +}; + +} // namespace impl + +void init_sinh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sinh_dispatch_vectors(); + using impl::sinh_contig_dispatch_vector; + using impl::sinh_output_typeid_vector; + using impl::sinh_strided_dispatch_vector; + + auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, 
depends, sinh_output_typeid_vector, + sinh_contig_dispatch_vector, sinh_strided_dispatch_vector); + }; + m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sinh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector); + }; + m.def("_sinh_result_type", sinh_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sinh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.hpp new file mode 100644 index 000000000000..4a0d90d24c8c --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sinh(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.cpp new file mode 100644 index 000000000000..a4a715147055 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sqrt.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sqrt.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U33: ==== SQRT (x) +namespace impl +{ + +namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt; + +static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types]; +static int sqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_sqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sqrt_fn_ns; + + using fn_ns::SqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector); + + using fn_ns::SqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector); + + using fn_ns::SqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sqrt_output_typeid_vector); +}; + +} // namespace impl + +void init_sqrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sqrt_dispatch_vectors(); + using impl::sqrt_contig_dispatch_vector; + using impl::sqrt_output_typeid_vector; + using impl::sqrt_strided_dispatch_vector; + + auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sqrt_output_typeid_vector, + sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector); + }; + m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector); + }; + m.def("_sqrt_result_type", sqrt_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.hpp new file mode 100644 index 000000000000..e8f7014c1afc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_sqrt(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/square.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/square.cpp
new file mode 100644
index 000000000000..d3e229ae42fc
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/square.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "square.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/square.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U32: ==== SQUARE (x)
+namespace impl
+{
+
+namespace square_fn_ns = dpctl::tensor::kernels::square;
+
+static unary_contig_impl_fn_ptr_t
+    square_contig_dispatch_vector[td_ns::num_types];
+static int square_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    square_strided_dispatch_vector[td_ns::num_types];
+
+void populate_square_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = square_fn_ns;
+
+    using fn_ns::SquareContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SquareContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(square_contig_dispatch_vector);
+
+    using fn_ns::SquareStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SquareStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(square_strided_dispatch_vector);
+
+    using fn_ns::SquareTypeMapFactory;
+    DispatchVectorBuilder<int, SquareTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(square_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_square(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_square_dispatch_vectors();
+        using impl::square_contig_dispatch_vector;
+        using impl::square_output_typeid_vector;
+        using impl::square_strided_dispatch_vector;
+
+        auto square_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, square_output_typeid_vector,
+                square_contig_dispatch_vector, square_strided_dispatch_vector);
+        };
+        m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto square_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              square_output_typeid_vector);
+        };
+        m.def("_square_result_type", square_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/square.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/square.hpp
new file mode 100644
index 000000000000..3f23f184499c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/square.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_square(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.cpp
new file mode 100644
index 000000000000..ec6edaa52dd5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.cpp
@@ -0,0 +1,243 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "subtract.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/subtract.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B23: ===== SUBTRACT (x1, x2)
+namespace impl
+{
+namespace subtract_fn_ns = dpctl::tensor::kernels::subtract;
+
+static binary_contig_impl_fn_ptr_t
+    subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int subtract_output_id_table[td_ns::num_types][td_ns::num_types];
+static int subtract_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// sub(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    subtract_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// sub(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    subtract_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    subtract_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_subtract_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = subtract_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::SubtractTypeMapFactory;
+    DispatchTableBuilder<int, SubtractTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(subtract_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::SubtractStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, SubtractStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(subtract_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::SubtractContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, SubtractContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(subtract_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::SubtractContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        SubtractContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        subtract_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::SubtractContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        SubtractContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::SubtractInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         SubtractInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::SubtractInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         SubtractInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::SubtractInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         SubtractInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::SubtractInplaceTypeMapFactory;
+    DispatchTableBuilder<int, SubtractInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(subtract_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_subtract(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_subtract_dispatch_tables();
+        using impl::subtract_contig_dispatch_table;
+        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::subtract_output_id_table;
+        using impl::subtract_strided_dispatch_table;
+
+        auto subtract_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, subtract_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                subtract_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                subtract_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               subtract_output_id_table);
+        };
+        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
+
+        using impl::subtract_inplace_contig_dispatch_table;
+        using impl::subtract_inplace_output_id_table;
+        using impl::subtract_inplace_row_matrix_dispatch_table;
+        using impl::subtract_inplace_strided_dispatch_table;
+
+        auto subtract_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, subtract_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                subtract_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                subtract_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                subtract_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.hpp
new file mode 100644
index 000000000000..89cdfd6d0ea0
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_subtract(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/tan.cpp
new file mode 100644
index 000000000000..8abdea0e5283
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/tan.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
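+///
+/// Besides the kernel entry point, each unary function also exposes a
+/// *_result_type helper. Roughly (a sketch of the intent, not the exact
+/// helper code): the input dtype is translated to a lookup id, the type-map
+/// vector yields the output type id, and an unsupported input (negative id)
+/// is reported to Python as None:
+///
+///     int src_id = array_types.typenum_to_lookup_id(dtype.num());
+///     int dst_id = tan_output_typeid_vector[src_id];
+///     // dst_id < 0 -> None; otherwise the corresponding output dtype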
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "tan.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/tan.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U34: ==== TAN (x)
+namespace impl
+{
+
+namespace tan_fn_ns = dpctl::tensor::kernels::tan;
+
+static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types];
+static int tan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    tan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_tan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = tan_fn_ns;
+
+    using fn_ns::TanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(tan_contig_dispatch_vector);
+
+    using fn_ns::TanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(tan_strided_dispatch_vector);
+
+    using fn_ns::TanTypeMapFactory;
+    DispatchVectorBuilder<int, TanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(tan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_tan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_tan_dispatch_vectors();
+        using impl::tan_contig_dispatch_vector;
+        using impl::tan_output_typeid_vector;
+        using impl::tan_strided_dispatch_vector;
+
+        auto tan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, tan_output_typeid_vector,
+                tan_contig_dispatch_vector, tan_strided_dispatch_vector);
+        };
+        m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto tan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector);
+        };
+        m.def("_tan_result_type", tan_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/tan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/tan.hpp
new file mode 100644
index 000000000000..b0818a9a85c2
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/tan.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_tan(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.cpp
new file mode 100644
index 000000000000..bf8ff205c0af
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "tanh.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/tanh.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U35: ==== TANH (x)
+namespace impl
+{
+
+namespace tanh_fn_ns = dpctl::tensor::kernels::tanh;
+
+static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types];
+static int tanh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    tanh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_tanh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = tanh_fn_ns;
+
+    using fn_ns::TanhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector);
+
+    using fn_ns::TanhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector);
+
+    using fn_ns::TanhTypeMapFactory;
+    DispatchVectorBuilder<int, TanhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(tanh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_tanh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_tanh_dispatch_vectors();
+        using impl::tanh_contig_dispatch_vector;
+        using impl::tanh_output_typeid_vector;
+        using impl::tanh_strided_dispatch_vector;
+
+        auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, tanh_output_typeid_vector,
+                tanh_contig_dispatch_vector, tanh_strided_dispatch_vector);
+        };
+        m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto tanh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector);
+        };
+        m.def("_tanh_result_type", tanh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/tanh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.hpp
new file mode 100644
index 000000000000..d29c924d5e73
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_tanh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.cpp
new file mode 100644
index 000000000000..4c1a117fbcae
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.cpp
@@ -0,0 +1,500 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <new>
+#include <stdexcept>
+#include <string>
+#include <tuple> // for std::ignore
+#include <utility>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "simplify_iteration_space.hpp"
+#include "true_divide.hpp"
+
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/true_divide.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B08: ===== DIVIDE (x1, x2)
+namespace impl
+{
+namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide;
+
+static binary_contig_impl_fn_ptr_t
+    true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+static int true_divide_inplace_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// divide(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    true_divide_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// divide(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    true_divide_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    true_divide_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    true_divide_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+void populate_true_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = true_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::TrueDivideTypeMapFactory;
+    DispatchTableBuilder<int, TrueDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(true_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::TrueDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         TrueDivideStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(true_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::TrueDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, TrueDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(true_divide_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        TrueDivideContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        true_divide_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        TrueDivideContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::TrueDivideInplaceTypeMapFactory;
+    DispatchTableBuilder<int, TrueDivideInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(true_divide_inplace_output_id_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::TrueDivideInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         TrueDivideInplaceStridedFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::TrueDivideInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         TrueDivideInplaceContigFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         TrueDivideInplaceRowMatrixBroadcastFactory, num_types>
+        dtb9;
+    dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table);
+};
+
+template <typename T, typename scalarT>
+class divide_by_scalar_krn;
+
+typedef sycl::event (*divide_by_scalar_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    py::ssize_t,
+    const char *,
+    char *,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T, typename scalarT>
+sycl::event divide_by_scalar(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             py::ssize_t arg_offset,
+                             const char *scalar_ptr,
+                             char *res_p,
+                             py::ssize_t res_offset,
+                             const std::vector<sycl::event> &depends = {})
+{
+    const scalarT sc_v = *reinterpret_cast<const scalarT *>(scalar_ptr);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using BinOpT =
+            dpctl::tensor::kernels::true_divide::TrueDivideFunctor<T, scalarT,
+                                                                   T>;
+
+        auto op = BinOpT();
+
+        using IndexerT =
+            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const IndexerT two_offsets_indexer{nd, arg_offset, res_offset,
+                                           shape_and_strides};
+
+        const T *arg_tp = reinterpret_cast<const T *>(arg_p);
+        T *res_tp = reinterpret_cast<T *>(res_p);
+
+        cgh.parallel_for<divide_by_scalar_krn<T, scalarT>>(
+            {nelems}, [=](sycl::id<1> id) {
+                const auto &two_offsets_ =
+                    two_offsets_indexer(static_cast<py::ssize_t>(id.get(0)));
+
+                const auto &arg_i = two_offsets_.get_first_offset();
+                const auto &res_i = two_offsets_.get_second_offset();
+                res_tp[res_i] = op(arg_tp[arg_i], sc_v);
+            });
+    });
+    return comp_ev;
+}
+
+std::pair<sycl::event, sycl::event>
+    py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
+                        double scalar,
+                        const dpctl::tensor::usm_ndarray &dst,
+                        sycl::queue &exec_q,
+                        const std::vector<sycl::event> &depends = {})
+{
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array has unexpected elemental data type.");
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+    // check shapes, broadcasting is assumed done by caller
+    // check that dimensions are the same
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != src.get_ndim()) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // check that shapes are the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; i < dst_nd; ++i) {
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    // if nelems is zero, return
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(src, dst) && !same_logical_tensors(src, dst))) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    static constexpr int float16_typeid =
+        static_cast<int>(td_ns::typenum_t::HALF);
+    static constexpr int float32_typeid =
+        static_cast<int>(td_ns::typenum_t::FLOAT);
+    static constexpr int float64_typeid =
+        static_cast<int>(td_ns::typenum_t::DOUBLE);
+    static constexpr int complex64_typeid =
+        static_cast<int>(td_ns::typenum_t::CFLOAT);
+    static constexpr int complex128_typeid =
+        static_cast<int>(td_ns::typenum_t::CDOUBLE);
+
+    // statically pre-allocated memory for scalar
+    alignas(double) char scalar_alloc[sizeof(double)] = {0};
+
+    divide_by_scalar_fn_ptr_t fn;
+    // placement new into stack memory means no call to delete is necessary
+    switch (src_typeid) {
+    case float16_typeid:
+    {
+        fn = divide_by_scalar<sycl::half, sycl::half>;
+        std::ignore =
+            new (scalar_alloc) sycl::half(static_cast<sycl::half>(scalar));
+        break;
+    }
+    case float32_typeid:
+    {
+        fn = divide_by_scalar<float, float>;
+        std::ignore = new (scalar_alloc) float(scalar);
+        break;
+    }
+    case float64_typeid:
+    {
+        fn = divide_by_scalar<double, double>;
+        std::ignore = new (scalar_alloc) double(scalar);
+        break;
+    }
+    case complex64_typeid:
+    {
+        fn = divide_by_scalar<std::complex<float>, float>;
+        std::ignore = new (scalar_alloc) float(scalar);
+        break;
+    }
+    case complex128_typeid:
+    {
+        fn = divide_by_scalar<std::complex<double>, double>;
+        std::ignore = new (scalar_alloc) double(scalar);
+        break;
+    }
+    default:
+        throw std::runtime_error("Implementation is missing for typeid=" +
+                                 std::to_string(src_typeid));
+    }
+
+    // simplify strides
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = dst_nd;
+    const py::ssize_t *shape = src_shape;
+
+    std::vector<sycl::event> host_tasks{};
+    simplify_iteration_space(nd, shape, src_strides, dst_strides,
+                             // outputs
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (nd == 0) {
+        // handle 0d array as 1d array with 1 element
+        static constexpr py::ssize_t one{1};
+        simplified_shape.push_back(one);
+        simplified_src_strides.push_back(one);
+        simplified_dst_strides.push_back(one);
+        src_offset = 0;
+        dst_offset = 0;
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_tasks, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
+    auto &copy_metadata_ev = std::get<2>(ptr_sz_event_triple_);
+
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    sycl::event div_ev =
+        fn(exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
+           scalar_alloc, dst_data, dst_offset, all_deps);
+
+    // async free of shape_strides temporary
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {div_ev}, shape_strides_owner);
+
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_tasks), div_ev);
+}
+
+} // namespace impl
+
+void init_divide(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_true_divide_dispatch_tables();
+        using impl::true_divide_contig_dispatch_table;
+        using impl::
+            true_divide_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::
+            true_divide_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::true_divide_output_id_table;
+        using impl::true_divide_strided_dispatch_table;
+
+        auto divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, true_divide_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                true_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                true_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                true_divide_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto divide_result_type_pyapi = [&](const py::dtype &dtype1,
+                                            const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               true_divide_output_id_table);
+        };
+        m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_divide_result_type", divide_result_type_pyapi, "");
+
+        using impl::true_divide_inplace_contig_dispatch_table;
+        using impl::true_divide_inplace_output_id_table;
+        using impl::true_divide_inplace_row_matrix_dispatch_table;
+        using impl::true_divide_inplace_strided_dispatch_table;
+
+        auto divide_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                        sycl::queue &exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, true_divide_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                true_divide_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                true_divide_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                true_divide_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        using impl::py_divide_by_scalar;
+        m.def("_divide_by_scalar", &py_divide_by_scalar, "", py::arg("src"),
+              py::arg("scalar"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.hpp
new file mode 100644
index 000000000000..941384beaf8d
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_divide(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.cpp
new file mode 100644
index 000000000000..3a798d8e110d
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
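+///
+/// The shared py_unary_ufunc helper used below chooses between the two
+/// dispatch vectors at run time; in outline (a sketch, details such as
+/// offset handling elided):
+///
+///     if (src.is_c_contiguous() && dst.is_c_contiguous()) {
+///         ev = trunc_contig_dispatch_vector[id](q, n, src_p, dst_p, deps);
+///     } else {
+///         ev = trunc_strided_dispatch_vector[id](q, n, nd, shape_strides,
+///                                                src_p, src_off,
+///                                                dst_p, dst_off, deps);
+///     }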
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "trunc.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/trunc.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U36: ==== TRUNC (x) +namespace impl +{ + +namespace trunc_fn_ns = dpctl::tensor::kernels::trunc; + +static unary_contig_impl_fn_ptr_t + trunc_contig_dispatch_vector[td_ns::num_types]; +static int trunc_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + trunc_strided_dispatch_vector[td_ns::num_types]; + +void populate_trunc_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = trunc_fn_ns; + + using fn_ns::TruncContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector); + + using fn_ns::TruncStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector); + + using fn_ns::TruncTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(trunc_output_typeid_vector); +}; + +} // namespace impl + +void init_trunc(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_trunc_dispatch_vectors(); + using impl::trunc_contig_dispatch_vector; + using impl::trunc_output_typeid_vector; + using impl::trunc_strided_dispatch_vector; + + auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, trunc_output_typeid_vector, + trunc_contig_dispatch_vector, trunc_strided_dispatch_vector); + }; + m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto trunc_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + trunc_output_typeid_vector); + }; + m.def("_trunc_result_type", trunc_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/trunc.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.hpp new file mode 100644 index 000000000000..79ed6b5ded14 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
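The DispatchVectorBuilder/Factory pattern driving populate_trunc_dispatch_vectors above is easiest to see in isolation: for each type id T, Factory<fnT, T>::get() yields either a concrete kernel pointer or nullptr for an unsupported type, and the builder writes the results into a flat table indexed by type id. A self-contained sketch, with all names illustrative:

#include <cstddef>
#include <type_traits>

using unary_fn_ptr_t = void (*)(const void *, void *, std::size_t);

template <typename T>
void example_unary_impl(const void *src, void *dst, std::size_t n)
{
    const T *s = static_cast<const T *>(src);
    T *d = static_cast<T *>(dst);
    for (std::size_t i = 0; i < n; ++i) {
        d[i] = s[i]; // stand-in for the real elementwise computation
    }
}

template <typename fnT, typename T> struct ExampleFactory
{
    fnT get()
    {
        if constexpr (std::is_arithmetic_v<T>) {
            return example_unary_impl<T>; // supported: concrete kernel
        }
        else {
            return nullptr; // unsupported type id: no kernel registered
        }
    }
};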
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_trunc(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/eye_ctor.cpp b/dpnp/tensor/libtensor/source/eye_ctor.cpp new file mode 100644 index 000000000000..025a7d58d06e --- /dev/null +++ b/dpnp/tensor/libtensor/source/eye_ctor.cpp @@ -0,0 +1,142 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "eye_ctor.hpp" +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::eye_fn_ptr_t; +static eye_fn_ptr_t eye_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_eye(py::ssize_t k, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 2D + + if (dst.get_ndim() != 2) { + throw py::value_error( + "usm_ndarray_eye: Expecting 2D array to populate"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + const py::ssize_t nelem = dst.get_size(); + const py::ssize_t rows = dst.get_shape(0); + const py::ssize_t cols = dst.get_shape(1); + if (rows == 0 || cols == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + if (!is_dst_c_contig && !is_dst_f_contig) { + throw py::value_error("USM array is not contiguous"); + } + + py::ssize_t start; + if (is_dst_c_contig) { + start = (k < 0) ? -k * cols : k; + } + else { + start = (k < 0) ? -k : k * rows; + } + + const py::ssize_t *strides = dst.get_strides_raw(); + py::ssize_t step; + if (strides == nullptr) { + step = (is_dst_c_contig) ? cols + 1 : rows + 1; + } + else { + step = strides[0] + strides[1]; + } + + const py::ssize_t length = std::min({rows, cols, rows + k, cols - k}); + const py::ssize_t end = start + step * (length - 1); + + char *dst_data = dst.get_data(); + sycl::event eye_event; + + auto fn = eye_dispatch_vector[dst_typeid]; + + eye_event = fn(exec_q, static_cast(nelem), start, end, step, + dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), + eye_event); +} + +void init_eye_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::constructors::EyeFactory; + + DispatchVectorBuilder dvb; + dvb.populate_dispatch_vector(eye_dispatch_vector); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/eye_ctor.hpp b/dpnp/tensor/libtensor/source/eye_ctor.hpp new file mode 100644 index 000000000000..dda7f2c4813a --- /dev/null +++ b/dpnp/tensor/libtensor/source/eye_ctor.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
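The start/step/length arithmetic in usm_ndarray_eye above follows from the flat layout: for a C-contiguous rows x cols matrix, element (i, j) lives at flat index i * cols + j, so diagonal k begins at flat index k (for k >= 0) or -k * cols (for k < 0) and advances by cols + 1 per element. A worked check with illustrative values:

#include <algorithm>
#include <cassert>

int main()
{
    const long rows = 4, cols = 5, k = -1; // k-th diagonal of a 4 x 5 matrix

    // C-contiguous layout: element (i, j) sits at flat index i * cols + j.
    const long start = (k < 0) ? -k * cols : k; // first diagonal element
    const long step = cols + 1;                 // flat distance between elements
    const long length = std::min({rows, cols, rows + k, cols - k});

    assert(start == 5);  // element (1, 0)
    assert(step == 6);
    assert(length == 3); // (1, 0), (2, 1), (3, 2)
    return 0;
}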
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_eye(py::ssize_t k, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_eye_ctor_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/full_ctor.cpp b/dpnp/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 000000000000..8d7fcd22b914 --- /dev/null +++ b/dpnp/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,309 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include // py::cast> +#include + +#include "kernels/constructors.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + sycl::event fill_ev; + + if constexpr (sizeof(dstTy) == sizeof(char)) { + const auto memset_val = sycl::bit_cast(fill_v); + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + bool is_zero = false; + if constexpr (sizeof(dstTy) == 1) { + is_zero = (std::uint8_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 2) { + is_zero = + (std::uint16_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 4) { + is_zero = + (std::uint32_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 8) { + is_zero = + (std::uint64_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 16) { + struct UInt128 + { + + constexpr UInt128() : v1{}, v2{} {} + UInt128(const UInt128 &) = default; + + operator bool() const { return bool(!v1) && bool(!v2); } + + std::uint64_t v1; + std::uint64_t v2; + }; + is_zero = static_cast(sycl::bit_cast(fill_v)); + } + + if (is_zero) { + static constexpr int memset_val = 0; + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + using dpctl::tensor::kernels::constructors::full_contig_impl; + + fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + } + } + + return fill_ev; +} + +template +struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, + int, + std::size_t, + py::ssize_t *, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given strided memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &exec_q, + int nd, + std::size_t nelems, + py::ssize_t *shape_strides, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + using dpctl::tensor::kernels::constructors::full_strided_impl; + sycl::event fill_ev = full_strided_impl( + exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends); + + return fill_ev; +} + +template +struct FullStridedFactory +{ + fnT get() + { + fnT f = full_strided_impl; + return f; + } +}; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types]; +static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // py_value should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + int nd = dst.get_ndim(); + auto const &dst_shape = dst.get_shape_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + auto fn = full_strided_dispatch_vector[dst_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &full_strided_ev = + fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, + {copy_shape_ev}); + + // free shape_strides + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), + full_strided_ev); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(full_contig_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(full_strided_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/full_ctor.hpp b/dpnp/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 000000000000..18c15de87a40 --- /dev/null +++ b/dpnp/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,57 @@ 
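full_contig_impl above special-cases fill values whose object representation is all zero bytes, lowering the fill to a plain memset instead of launching a fill kernel; the sizeof-keyed bit_cast chain (including the UInt128 helper for 16-byte types) is just that zero-bits test. A portable standalone equivalent, illustrative only:

#include <cstring>
#include <type_traits>

// Returns true when the value's object representation is all zero bytes,
// in which case the fill can be lowered to memset.
template <typename T> bool is_all_zero_bits(const T &v)
{
    static_assert(std::is_trivially_copyable_v<T>);
    unsigned char bytes[sizeof(T)];
    std::memcpy(bytes, &v, sizeof(T)); // inspect raw bytes portably
    for (unsigned char b : bytes) {
        if (b != 0) {
            return false;
        }
    }
    return true;
}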
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpnp/tensor/libtensor/source/integer_advanced_indexing.cpp new file mode 100644 index 000000000000..c6021bdfd2d1 --- /dev/null +++ b/dpnp/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -0,0 +1,814 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
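A recurring lifetime idiom in this file (see _populate_kernel_params below) is to submit an empty host_task that captures shared ownership of host-side staging buffers and depends on the device-copy events, so the buffers cannot be freed before the asynchronous copies out of them complete. A minimal sketch, assuming a SYCL 2020 queue; names are illustrative:

#include <memory>
#include <vector>

#include <sycl/sycl.hpp>

// The host_task body is empty on purpose: what matters is that the
// captured shared_ptr keeps the buffer alive until the copies finish.
sycl::event keep_alive_until(sycl::queue &q,
                             std::shared_ptr<std::vector<long>> host_buf,
                             const std::vector<sycl::event> &copy_evs)
{
    return q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(copy_evs);
        cgh.host_task([buf = std::move(host_buf)] {});
    });
}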
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/integer_advanced_indexing.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "integer_advanced_indexing.hpp" + +#define INDEXING_MODES 2 +#define WRAP_MODE 0 +#define CLIP_MODE 1 + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::vector + _populate_kernel_params(sycl::queue &exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + const py::ssize_t *arr_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) +{ + + using usm_host_allocator_T = + dpctl::tensor::alloc_utils::usm_host_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using shT = std::vector; + + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); + + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, 
sz_allocator); + + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * (k + ind_sh_elems), sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + const sycl::event &device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + const sycl::event &device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); + } + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems + + axis_start); + } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); + + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); + } + + if (ind_nd > 0) { + std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k + ind_nd); + } + + const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + const sycl::event &device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + const sycl::event &shared_ptr_cleanup_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); + }); + host_task_events.push_back(shared_ptr_cleanup_ev); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; +} + +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ +std::vector parse_py_ind(const sycl::queue &q, + const py::object &py_ind) +{ + std::size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool nd_is_known 
= false; + int nd = -1; + for (std::size_t i = 0; i < ind_count; ++i) { + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (nd_is_known) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + nd_is_known = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + +std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &dst, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + + int k = ind.size(); + + if (k == 0) { + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(src_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (src_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? 
i : i + ind_nd; + + orthog_nelems *= static_cast(src_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Array memory overlap."); + } + + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * ind_nelems); + + int ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + + std::vector ind_offsets; + ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(dst, ind_)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = 
packed_ind_ptrs_owner.get(); + + // rearrange to past where indices shapes are checked + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(src_nd - k, 1); + + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + + // dst_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k], + // dst_shape[axis:axis+ind.ndim], + // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, take_generic_ev); +} + +std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &val, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); + + if (k == 0) { + // no indices to write to + throw py::value_error("List of indices is 
empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(dst_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + std::size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (dst_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(dst_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(val, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are 
checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(ind_, dst)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(dst_nd - k, 1); + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + + // val_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k], + // val_shape[axis:axis+ind.ndim], + // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, val_shape, dst_strides, val_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event 
put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); + + return std::make_pair(arg_cleanup_ev, put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpnp/tensor/libtensor/source/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..bc0136288e1c --- /dev/null +++ b/dpnp/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -0,0 +1,71 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
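The packed_shapes_strides comments above describe one flat device allocation holding the orthogonal (non-indexed) shape followed by the two corresponding stride blocks. A host-side sketch of that packing, simplified to a single indexed axis (k == 1 with a 1-D index array, so source and destination skip the same axis); illustrative only:

#include <cstddef>
#include <vector>

// Pack the non-indexed dimensions of shape plus two stride vectors into
// one flat buffer of 3 * orthog_nd entries, skipping the indexed axis.
std::vector<long> pack_orthog(const std::vector<long> &shape,
                              const std::vector<long> &src_strides,
                              const std::vector<long> &dst_strides,
                              std::size_t axis)
{
    const std::size_t nd = shape.size();
    const std::size_t orthog_nd = nd - 1;
    std::vector<long> packed(3 * orthog_nd);

    std::size_t j = 0;
    for (std::size_t i = 0; i < nd; ++i) {
        if (i == axis) {
            continue; // skip the indexed axis
        }
        packed[j] = shape[i];
        packed[orthog_nd + j] = src_strides[i];
        packed[2 * orthog_nd + j] = dst_strides[i];
        ++j;
    }
    return packed;
}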
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.take and dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern void init_advanced_indexing_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp new file mode 100644 index 000000000000..9621ebc3277f --- /dev/null +++ b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp @@ -0,0 +1,834 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "dot.hpp" +#include "dot_atomic_support.hpp" +#include "dot_dispatch.hpp" +#include "elementwise_functions/elementwise_functions_type_utils.hpp" +#include "kernels/linalg_functions/dot_product.hpp" +#include "kernels/linalg_functions/gemm.hpp" +#include "reductions/reduction_atomic_support.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +static int dot_output_id_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::dot_product_impl_fn_ptr_t; +static dot_product_impl_fn_ptr_t dot_product_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static dot_product_impl_fn_ptr_t + dot_product_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::dot_product_contig_impl_fn_ptr_t; +static dot_product_contig_impl_fn_ptr_t + dot_product_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static dot_product_contig_impl_fn_ptr_t + dot_product_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::gemm_impl_fn_ptr_t; +static gemm_impl_fn_ptr_t gemm_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static gemm_impl_fn_ptr_t gemm_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::gemm_contig_impl_fn_ptr_t; +static gemm_contig_impl_fn_ptr_t + gemm_contig_atomic_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static gemm_contig_impl_fn_ptr_t + gemm_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::gemm_batch_impl_fn_ptr_t; +static gemm_batch_impl_fn_ptr_t + gemm_batch_atomic_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static gemm_batch_impl_fn_ptr_t + gemm_batch_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::gemm_batch_contig_impl_fn_ptr_t; +static gemm_batch_contig_impl_fn_ptr_t + gemm_batch_contig_atomic_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static gemm_batch_contig_impl_fn_ptr_t + gemm_batch_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void init_dot_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(dot_output_id_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(gemm_batch_atomic_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(gemm_batch_contig_atomic_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(gemm_atomic_dispatch_table); + + td_ns::DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(gemm_contig_atomic_dispatch_table); + + td_ns::DispatchTableBuilder + dtb6; + 
dtb6.populate_dispatch_table(gemm_batch_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(gemm_batch_contig_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(gemm_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb9; + dtb9.populate_dispatch_table(gemm_contig_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb10; + dtb10.populate_dispatch_table(dot_product_dispatch_table); + + td_ns::DispatchTableBuilder + dtb11; + dtb11.populate_dispatch_table(dot_product_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb12; + dtb12.populate_dispatch_table(dot_product_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb13; + dtb13.populate_dispatch_table(dot_product_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t dot_atomic_support_vector[td_ns::num_types]; + +void init_dot_atomic_support_vector(void) +{ + + using atomic_support::DotAtomicSupportFactory; + td_ns::DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(dot_atomic_support_vector); +} + +std::pair + py_dot(const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2, + int batch_dims, + int x1_outer_dims, + int x2_outer_dims, + int inner_dims, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (inner_dims == 0) { + throw py::value_error("No inner dimension for dot"); + } + + int x1_nd = x1.get_ndim(); + int x2_nd = x2.get_ndim(); + if (x1_nd != (batch_dims + x1_outer_dims + inner_dims) || + x2_nd != (batch_dims + x2_outer_dims + inner_dims)) { + throw py::value_error("Input arrays do not have dimensions consistent " + "with input dimensions"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != (batch_dims + x1_outer_dims + x2_outer_dims)) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of input dimensions"); + } + + const py::ssize_t *x1_shape_ptr = x1.get_shape_raw(); + const py::ssize_t *x2_shape_ptr = x2.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t batches(1); + for (int i = 0; same_shapes && (i < batch_dims); ++i) { + same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]) && + (x2_shape_ptr[i] == dst_shape_ptr[i]); + batches *= x1_shape_ptr[i]; + } + std::size_t x1_outer_nelems(1); + for (int i = batch_dims; same_shapes && (i < (batch_dims + x1_outer_dims)); + ++i) { + same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]); + x1_outer_nelems *= x1_shape_ptr[i]; + } + std::size_t inner_nelems(1); + for (int i = batch_dims; i < (batch_dims + inner_dims); ++i) { + auto x1_shape_idx = x1_outer_dims + i; + same_shapes = + same_shapes && (x1_shape_ptr[x1_shape_idx] == x2_shape_ptr[i]); + inner_nelems *= x1_shape_ptr[x1_shape_idx]; + } + std::size_t x2_outer_nelems(1); + for (int i = 0; same_shapes && (i < x2_outer_dims); ++i) { + auto x2_shape_idx = batch_dims + inner_dims + i; + same_shapes = + same_shapes && (x2_shape_ptr[x2_shape_idx] == + dst_shape_ptr[batch_dims + x1_outer_dims + i]); + x2_outer_nelems *= x2_shape_ptr[x2_shape_idx]; + } + if (!same_shapes) { + throw py::value_error("Input arrays to 
tensor dot product do not have " + "appropriate shapes"); + } + + std::size_t dst_nelems = batches * x1_outer_nelems * x2_outer_nelems; + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + if (static_cast(dst.get_size()) != dst_nelems) { + throw py::value_error("dst shape and size mismatch"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with x1 or x2 + if (overlap(dst, x1) || overlap(dst, x2)) { + throw py::value_error("Result array overlaps with inputs"); + } + + int x1_typenum = x1.get_typenum(); + int x2_typenum = x2.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum); + int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int output_typeid = dot_output_id_table[x1_typeid][x2_typeid]; + + if (output_typeid != dst_typeid) { + throw py::value_error( + "Result array has unexpected elemental data type."); + } + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + bool supports_atomics = + dot_atomic_support_vector[output_typeid](exec_q, usm_type); + + const char *x1_data = x1.get_data(); + const char *x2_data = x2.get_data(); + char *dst_data = dst.get_data(); + + const auto &x1_shape_vec = x1.get_shape_vector(); + const auto &x1_strides_vec = x1.get_strides_vector(); + + const auto &x2_shape_vec = x2.get_shape_vector(); + const auto &x2_strides_vec = x2.get_strides_vector(); + + const auto &dst_shape_vec = dst.get_shape_vector(); + const auto &dst_strides_vec = dst.get_strides_vector(); + + bool is_x1_c_contig = x1.is_c_contiguous(); + bool is_x1_f_contig = x1.is_f_contiguous(); + bool is_x2_c_contig = x2.is_c_contiguous(); + bool is_x2_f_contig = x2.is_f_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + bool call_vecdot = ((x1_outer_dims == 0 && x1_outer_nelems == 1) && + (x2_outer_dims == 0 && x2_outer_nelems == 1)); + + bool call_batched = (batch_dims != 0 || batches > 1); + std::vector host_task_events{}; + sycl::event dot_ev; + if (call_vecdot) { + if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig) || + ((is_x1_f_contig && is_x2_f_contig) && !call_batched)) { + dot_product_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = dot_product_contig_dispatch_table[x1_typeid][x2_typeid]; + } + else { + fn = dot_product_contig_temps_dispatch_table[x1_typeid] + [x2_typeid]; + } + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(), + x2.get_data(), dst.get_data(), + zero_offset, // lhs batch offset + zero_offset, // rhs batch offset + zero_offset, // res batch offset + zero_offset, // lhs reduction offset + zero_offset, // rhs reduction offset + depends); + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {x1, x2, dst}, {dot_ev}), + dot_ev); + } + } + int inner_nd = inner_dims; + const py::ssize_t *inner_shape_ptr = x1_shape_ptr + batch_dims; + using shT = std::vector; + const shT inner_x1_strides(std::begin(x1_strides_vec) + batch_dims, + std::end(x1_strides_vec)); + const shT inner_x2_strides(std::begin(x2_strides_vec) + batch_dims, + std::end(x2_strides_vec)); + + shT simplified_inner_shape; + shT 
simplified_inner_x1_strides; + shT simplified_inner_x2_strides; + py::ssize_t inner_x1_offset(0); + py::ssize_t inner_x2_offset(0); + + simplify_iteration_space( + inner_nd, inner_shape_ptr, inner_x1_strides, inner_x2_strides, + // output + simplified_inner_shape, simplified_inner_x1_strides, + simplified_inner_x2_strides, inner_x1_offset, inner_x2_offset); + + const py::ssize_t *batch_shape_ptr = x1_shape_ptr; + + const shT batch_x1_strides(std::begin(x1_strides_vec), + std::begin(x1_strides_vec) + batch_dims); + const shT batch_x2_strides(std::begin(x2_strides_vec), + std::begin(x2_strides_vec) + batch_dims); + shT const &batch_dst_strides = dst_strides_vec; + + shT simplified_batch_shape; + shT simplified_batch_x1_strides; + shT simplified_batch_x2_strides; + shT simplified_batch_dst_strides; + py::ssize_t batch_x1_offset(0); + py::ssize_t batch_x2_offset(0); + py::ssize_t batch_dst_offset(0); + + if (batch_dims == 0) { + if (dst_nelems != 1) { + throw std::runtime_error( + "batch_dims == 0, but dst_nelems != 1"); + } + batch_dims = 1; + simplified_batch_shape.push_back(1); + simplified_batch_x1_strides.push_back(0); + simplified_batch_x2_strides.push_back(0); + simplified_batch_dst_strides.push_back(0); + } + else { + simplify_iteration_space_3( + batch_dims, batch_shape_ptr, batch_x1_strides, batch_x2_strides, + batch_dst_strides, + // output + simplified_batch_shape, simplified_batch_x1_strides, + simplified_batch_x2_strides, simplified_batch_dst_strides, + batch_x1_offset, batch_x2_offset, batch_dst_offset); + } + + if (inner_nd == 1 && batch_dims == 1) { + bool dot_product_c_contig = false; + bool reduce_all_elems = false; + + if (simplified_inner_x1_strides[0] == 1 && + simplified_inner_x2_strides[0] == 1) { + reduce_all_elems = (simplified_batch_shape[0] == 1); + dot_product_c_contig = + (simplified_batch_dst_strides[0] == 1) && + (static_cast(simplified_batch_x1_strides[0]) == + inner_nelems) && + (static_cast(simplified_batch_x2_strides[0]) == + inner_nelems); + } + + if (dot_product_c_contig || reduce_all_elems) { + dot_product_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = + dot_product_contig_dispatch_table[x1_typeid][x2_typeid]; + } + else { + fn = dot_product_contig_temps_dispatch_table[x1_typeid] + [x2_typeid]; + } + if (fn != nullptr) { + dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(), + x2.get_data(), dst.get_data(), + batch_x1_offset, // lhs batch offset + batch_x2_offset, // rhs batch offset + batch_dst_offset, // res batch offset + inner_x1_offset, // lhs reduction offset + inner_x2_offset, // rhs reduction offset + depends); + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {x1, x2, dst}, {dot_ev}), + dot_ev); + } + } + } + + dot_product_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = dot_product_dispatch_table[x1_typeid][x2_typeid]; + } + if (fn == nullptr) { + fn = dot_product_temps_dispatch_table[x1_typeid][x2_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "Implementation is missing for x1_typeid=" + + std::to_string(x1_typeid) + + " and x2_typeid=" + std::to_string(x2_typeid)); + } + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_batch_shape, simplified_batch_x1_strides, + simplified_batch_x2_strides, simplified_batch_dst_strides, + // reduction metadata + simplified_inner_shape, simplified_inner_x1_strides, + 
simplified_inner_x2_strides); + auto tmp_alloc_owner = + std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = + std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *inner_shape_stride = + temp_allocation_ptr + 4 * simplified_batch_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + dot_ev = + fn(exec_q, dst_nelems, inner_nelems, x1.get_data(), x2.get_data(), + dst.get_data(), batch_dims, iter_shape_and_strides, + batch_x1_offset, batch_x2_offset, batch_dst_offset, + inner_nd, // number dimensions being reduced + inner_shape_stride, inner_x1_offset, inner_x2_offset, all_deps); + + sycl::event temp_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {dot_ev}, + tmp_alloc_owner); + host_task_events.push_back(temp_cleanup_ev); + } + else { // if (!call_vecdot) + if (!call_batched) { + if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig)) { + gemm_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = + gemm_contig_atomic_dispatch_table[x1_typeid][x2_typeid]; + } + else { + fn = gemm_contig_temps_dispatch_table[x1_typeid][x2_typeid]; + } + if (fn != nullptr) { + dot_ev = fn(exec_q, x1_data, x2_data, dst_data, + x1_outer_nelems, // n + inner_nelems, // k + x2_outer_nelems, // m + depends); + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {x1, x2, dst}, {dot_ev}), + dot_ev); + } + } + gemm_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = gemm_atomic_dispatch_table[x1_typeid][x2_typeid]; + } + if (fn == nullptr) { + fn = gemm_temps_dispatch_table[x1_typeid][x2_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "Implementation is missing for x1_typeid=" + + std::to_string(x1_typeid) + + " and x2_typeid=" + std::to_string(x2_typeid)); + } + } + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, x1_shape_vec, x1_strides_vec, + x2_shape_vec, x2_strides_vec, dst_shape_vec, dst_strides_vec); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *x1_shape_strides = packed_shapes_strides; + const py::ssize_t *x2_shape_strides = + packed_shapes_strides + 2 * (x1_nd); + const py::ssize_t *dst_shape_strides = + packed_shapes_strides + 2 * (x1_nd + x2_nd); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + // change gemm calls to pass inner dims and outer dims separately + dot_ev = + fn(exec_q, x1_data, x2_data, dst_data, x1_outer_nelems, + inner_nelems, x2_outer_nelems, inner_dims, x1_outer_dims, + x1_shape_strides, x2_outer_dims, x2_shape_strides, + x1_outer_dims + x2_outer_dims, dst_shape_strides, all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dot_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { // if (call_batched) + using shT = 
std::vector; + // temporary asserts for matmul + assert(x1_outer_dims == 1); + assert(x2_outer_dims == 1); + assert(inner_dims == 1); + + if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig)) { + gemm_batch_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = gemm_batch_contig_atomic_dispatch_table[x1_typeid] + [x2_typeid]; + } + else { + fn = gemm_batch_contig_temps_dispatch_table[x1_typeid] + [x2_typeid]; + } + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + dot_ev = fn(exec_q, x1_data, x2_data, dst_data, batches, + x1_outer_nelems, // n + inner_nelems, // k + x2_outer_nelems, // m + zero_offset, zero_offset, zero_offset, depends); + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {x1, x2, dst}, {dot_ev}), + dot_ev); + } + } + + auto x1_outer_inner_dims = x1_nd - batch_dims; + auto x2_outer_inner_dims = x2_nd - batch_dims; + auto dst_outer_inner_dims = dst_nd - batch_dims; + + shT batch_x1_shape; + shT outer_inner_x1_shape; + shT batch_x1_strides; + shT outer_inner_x1_strides; + split_iteration_space(x1_shape_vec, x1_strides_vec, batch_dims, + batch_dims + x1_outer_inner_dims, + // 4 vectors modified + batch_x1_shape, outer_inner_x1_shape, + batch_x1_strides, outer_inner_x1_strides); + + shT batch_x2_shape; + shT outer_inner_x2_shape; + shT batch_x2_strides; + shT outer_inner_x2_strides; + split_iteration_space(x2_shape_vec, x2_strides_vec, batch_dims, + batch_dims + x2_outer_inner_dims, + // 4 vectors modified + batch_x2_shape, outer_inner_x2_shape, + batch_x2_strides, outer_inner_x2_strides); + + shT batch_dst_shape; + shT outer_inner_dst_shape; + shT batch_dst_strides; + shT outer_inner_dst_strides; + split_iteration_space(dst_shape_vec, dst_strides_vec, batch_dims, + batch_dims + dst_outer_inner_dims, + // 4 vectors modified + batch_dst_shape, outer_inner_dst_shape, + batch_dst_strides, outer_inner_dst_strides); + + using shT = std::vector; + shT simplified_batch_shape; + shT simplified_batch_x1_strides; + shT simplified_batch_x2_strides; + shT simplified_batch_dst_strides; + py::ssize_t x1_batch_offset(0); + py::ssize_t x2_batch_offset(0); + py::ssize_t dst_batch_offset(0); + + const py::ssize_t *shape = x1_shape_ptr; + + simplify_iteration_space_3( + batch_dims, shape, batch_x1_strides, batch_x2_strides, + batch_dst_strides, + // outputs + simplified_batch_shape, simplified_batch_x1_strides, + simplified_batch_x2_strides, simplified_batch_dst_strides, + x1_batch_offset, x2_batch_offset, dst_batch_offset); + + if (batch_dims == 1 && x1_outer_dims == 1 && x2_outer_dims == 1 && + inner_dims == 1) { + bool gemm_batch_c_contig = false; + + if ((static_cast(outer_inner_x1_strides[0]) == + inner_nelems && + outer_inner_x1_strides[1] == 1) && + (static_cast(outer_inner_x2_strides[0]) == + inner_nelems && + outer_inner_x2_strides[1] == 1) && + (static_cast(outer_inner_dst_strides[0]) == + x2_outer_nelems && + outer_inner_dst_strides[1] == 1)) { + gemm_batch_c_contig = + (static_cast( + simplified_batch_x1_strides[0]) == + x1_outer_nelems * inner_nelems) && + (static_cast( + simplified_batch_x2_strides[0]) == + x2_outer_nelems * inner_nelems) && + (static_cast( + simplified_batch_dst_strides[0]) == + x1_outer_nelems * x2_outer_nelems); + } + + if (gemm_batch_c_contig) { + gemm_batch_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = gemm_batch_contig_atomic_dispatch_table[x1_typeid] + [x2_typeid]; + } + else { + fn = gemm_batch_contig_temps_dispatch_table[x1_typeid] + [x2_typeid]; + } + if (fn != nullptr) { + dot_ev = 
fn(exec_q, x1_data, x2_data, dst_data, batches, + x1_outer_nelems, // n + inner_nelems, // k + x2_outer_nelems, // m + x1_batch_offset, x2_batch_offset, + dst_batch_offset, depends); + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {x1, x2, dst}, + {dot_ev}), + dot_ev); + } + } + } + + gemm_batch_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = gemm_batch_atomic_dispatch_table[x1_typeid][x2_typeid]; + } + if (fn == nullptr) { + fn = gemm_batch_temps_dispatch_table[x1_typeid][x2_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "Implementation is missing for x1_typeid=" + + std::to_string(x1_typeid) + + " and x2_typeid=" + std::to_string(x2_typeid)); + } + } + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_batch_shape, + simplified_batch_x1_strides, simplified_batch_x2_strides, + simplified_batch_dst_strides, outer_inner_x1_shape, + outer_inner_x1_strides, outer_inner_x2_shape, + outer_inner_x2_strides, outer_inner_dst_shape, + outer_inner_dst_strides, + // full shape and strides of the result array + // necessary for reduction and initialization + simplified_batch_shape, outer_inner_dst_shape, + simplified_batch_dst_strides, outer_inner_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const auto batch_shape_strides = packed_shapes_strides; + const auto x1_outer_inner_shapes_strides = + packed_shapes_strides + 4 * batch_dims; + const auto x2_outer_inner_shapes_strides = + packed_shapes_strides + 4 * batch_dims + + 2 * (x1_outer_inner_dims); + const auto dst_outer_shapes_strides = + packed_shapes_strides + 4 * batch_dims + + 2 * (x1_outer_inner_dims) + 2 * (x2_outer_inner_dims); + const auto dst_full_shape_strides = + packed_shapes_strides + 4 * batch_dims + + 2 * (x1_outer_inner_dims) + 2 * (x2_outer_inner_dims) + + 2 * (dst_outer_inner_dims); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + dot_ev = fn( + exec_q, x1_data, x2_data, dst_data, batches, x1_outer_nelems, + inner_nelems, x2_outer_nelems, batch_dims, batch_shape_strides, + x1_batch_offset, x2_batch_offset, dst_batch_offset, inner_dims, + x1_outer_dims, x1_outer_inner_shapes_strides, x2_outer_dims, + x2_outer_inner_shapes_strides, x1_outer_dims + x2_outer_dims, + dst_outer_shapes_strides, dst_full_shape_strides, all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dot_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + } + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {x1, x2, dst}, host_task_events), + dot_ev); +} + +template +py::object py_dot_result_type(const py::dtype &input1_dtype, + const py::dtype &input2_dtype, + const output_typesT &output_types_table) +{ + int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl + int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl + int src1_typeid = -1; + int src2_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + src1_typeid = array_types.typenum_to_lookup_id(tn1); + src2_typeid = 
array_types.typenum_to_lookup_id(tn2); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 || + src2_typeid >= td_ns::num_types) { + throw std::runtime_error("binary output type lookup failed"); + } + int dst_typeid = output_types_table[src1_typeid][src2_typeid]; + + if (dst_typeid < 0) { + auto res = py::none(); + return py::cast(res); + } + else { + auto dst_typenum_t = static_cast(dst_typeid); + auto dt = type_utils::_dtype_from_typenum(dst_typenum_t); + + return py::cast(dt); + } +} + +void init_dot(py::module_ m) +{ + init_dot_atomic_support_vector(); + init_dot_dispatch_tables(); + + m.def("_dot", &py_dot, "", py::arg("x1"), py::arg("x2"), + py::arg("batch_dims"), py::arg("x1_outer_dims"), + py::arg("x2_outer_dims"), py::arg("inner_dims"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto dot_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_dot_result_type(dtype1, dtype2, dot_output_id_table); + }; + m.def("_dot_result_type", dot_result_type_pyapi, ""); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot.hpp new file mode 100644 index 000000000000..f6a23ace5cd9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/linalg_functions/dot.hpp @@ -0,0 +1,45 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
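Taken together, dot.cpp registers _dot and _dot_result_type and fills its thirteen dispatch tables once at init time; every call afterwards reduces to table lookups keyed by typenum_to_lookup_id. Below is a reduced, self-contained sketch of that table-of-function-pointers pattern, using two stand-in types instead of the full td_ns type list; all names in it are illustrative, not the library's.

#include <cstdio>

constexpr int num_types = 2; // stand-in for td_ns::num_types

using binop_fn_ptr_t = double (*)(double, double);

template <typename T1, typename T2> double typed_dot(double a, double b)
{
    return a * b; // placeholder for a real kernel instantiation
}

// Factory per type pair; the real Gemm*/DotProduct* factories return
// nullptr when the (T1, T2) output type is not defined.
template <typename T1, typename T2> struct DotFactory
{
    binop_fn_ptr_t get() { return typed_dot<T1, T2>; }
};

static binop_fn_ptr_t dispatch_table[num_types][num_types];

template <typename T1, typename T2> void populate_entry(int i, int j)
{
    dispatch_table[i][j] = DotFactory<T1, T2>{}.get();
}

int main()
{
    // populate once, analogous to init_dot_dispatch_tables()
    populate_entry<float, float>(0, 0);
    populate_entry<float, double>(0, 1);
    populate_entry<double, float>(1, 0);
    populate_entry<double, double>(1, 1);

    int t1 = 1, t2 = 0; // runtime ids, as from typenum_to_lookup_id
    std::printf("%g\n", dispatch_table[t1][t2](3.0, 4.0)); // 12
}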
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_dot(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
new file mode 100644
index 000000000000..66b9b5004575
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
@@ -0,0 +1,58 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
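dot.hpp itself only exports init_dot(py::module_); the convention is that each translation unit owns its registrations while a central module body chains the init_* hooks. A minimal pybind11 sketch of that wiring follows; the module name is made up, since the real module assembly is not part of this hunk.

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Each feature area contributes one registration hook, as dot.hpp does.
void init_dot(py::module_ m)
{
    m.def("_dot_placeholder", []() { return 42; });
}

// Hypothetical module name; dpnp's actual extension module differs.
PYBIND11_MODULE(_tensor_impl_sketch, m)
{
    init_dot(m); // additional init_* hooks would follow here
}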
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+
+#include "reductions/reduction_atomic_support.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::py_internal::atomic_support
+{
+
+template <typename fnT, typename T>
+struct DotAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value) {
+            return atomic_support::fixed_decision<false>;
+        }
+        else {
+            return atomic_support::check_atomic_support<T>;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::py_internal::atomic_support
diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
new file mode 100644
index 000000000000..984f71a4c183
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
@@ -0,0 +1,405 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
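DotAtomicSupportFactory encodes a single policy: complex accumulation never takes the atomic path, while every other type defers to a runtime capability check against the allocation and device. A self-contained sketch of that if constexpr selection, with the device probe stubbed out as a placeholder:

#include <complex>
#include <iostream>
#include <type_traits>

template <typename T> struct is_complex : std::false_type {};
template <typename T> struct is_complex<std::complex<T>> : std::true_type {};

using support_fn_t = bool (*)();

// Stand-ins: the real code returns fixed_decision / check_atomic_support
// functions that inspect the SYCL queue and USM allocation kind.
bool fixed_false() { return false; }
template <typename T> bool probe_device() { return sizeof(T) <= 8; }

template <typename T> support_fn_t select_support_fn()
{
    if constexpr (is_complex<T>::value) {
        return fixed_false; // atomics do not cover complex accumulators
    }
    else {
        return probe_device<T>; // defer to the runtime capability check
    }
}

int main()
{
    std::cout << select_support_fn<double>()() << '\n';              // 1
    std::cout << select_support_fn<std::complex<float>>()() << '\n'; // 0
}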
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include "kernels/linalg_functions/dot_product.hpp" +#include "kernels/linalg_functions/gemm.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct DotAtomicOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +// add separate type support lists for atomic vs. temps +// gemm, gevm, and dot product share output type struct +template +struct DotNoAtomicOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +template +struct DotTypeMapFactory +{ + /*! 
@brief get typeid for output type of kernels called by py_dot */ + std::enable_if_t::value, int> get() + { + using rT1 = typename DotNoAtomicOutputType::value_type; + using rT2 = typename DotAtomicOutputType::value_type; + static_assert(std::is_same_v || std::is_same_v); + return td_ns::GetTypeid{}.get(); + } +}; + +template +struct GemmBatchAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_batch_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = gemm_batch_impl; + return fn; + } + } +}; + +template +struct GemmBatchContigAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_batch_contig_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = gemm_batch_contig_impl; + return fn; + } + } +}; + +template +struct GemmAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = gemm_impl; + return fn; + } + } +}; + +template +struct GemmContigAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_contig_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = gemm_contig_impl; + return fn; + } + } +}; + +template +struct GemmTempsFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = gemm_tree_impl; + return fn; + } + } +}; + +template +struct GemmContigTempsFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_contig_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = gemm_contig_tree_impl; + return fn; + } + } +}; + +template +struct GemmBatchTempsFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_batch_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = gemm_batch_tree_impl; + return fn; + } + } +}; + +template +struct GemmBatchContigTempsFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_batch_contig_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = gemm_batch_contig_tree_impl; + return fn; + } + } +}; + +template +struct DotProductAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::dot_product_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = dot_product_impl; + return fn; + } + } +}; + +template +struct DotProductNoAtomicFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::dot_product_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = dot_product_tree_impl; + return 
fn; + } + } +}; + +template +struct DotProductContigAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::dot_product_contig_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = dot_product_contig_impl; + return fn; + } + } +}; + +template +struct DotProductContigNoAtomicFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::dot_product_contig_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = dot_product_contig_tree_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/linear_sequences.cpp b/dpnp/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..9a7bf2dbcc0f --- /dev/null +++ b/dpnp/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,306 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
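The Dot*OutputType structs above resolve the result type of a (T1, T2) pair as the first matching BinaryTypeMapResultEntry in a std::disjunction, with a default entry marking the pair undefined; is_defined is what lets the kernel factories hand back nullptr for unsupported combinations. A reduced, self-contained sketch of that first-match idiom:

#include <cstdint>
#include <type_traits>

// One map entry: matches when T1/T2 equal the listed argument types.
template <typename T1, typename A1, typename T2, typename A2, typename R>
struct MapEntry
    : std::bool_constant<std::is_same_v<T1, A1> && std::is_same_v<T2, A2>>
{
    using result_type = R;
};

struct Unsupported : std::true_type // always matches; used as fallback
{
    using result_type = void;
};

template <typename T1, typename T2> struct DotOutputType
{
    // std::disjunction inherits from the first entry whose ::value is
    // true, so ::result_type comes from the first matching map entry.
    using value_type = typename std::disjunction<
        MapEntry<T1, std::int32_t, T2, std::int32_t, std::int32_t>,
        MapEntry<T1, float, T2, float, float>,
        Unsupported>::result_type;

    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
};

static_assert(std::is_same_v<DotOutputType<float, float>::value_type, float>);
static_assert(!DotOutputType<float, std::int32_t>::is_defined);

int main() { return 0; }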
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===---------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include // py::cast> +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +#include "linear_sequences.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, 
{linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. + */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/linear_sequences.hpp b/dpnp/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..45cf45153462 --- /dev/null +++ b/dpnp/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,66 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
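linear_sequences.cpp dispatches two fill kernels: the step form computes value[i] = start + i * step, and the affine form interpolates from start toward end, presumably dividing by len - 1 when include_endpoint is set so the last element equals end, and by len otherwise. A scalar sketch under that assumption about the endpoint handling:

#include <cstddef>
#include <iostream>
#include <vector>

// value[i] = start + i * step  (the lin_space_step semantics)
std::vector<double> lin_step(double start, double step, std::size_t n)
{
    std::vector<double> out(n);
    for (std::size_t i = 0; i < n; ++i)
        out[i] = start + static_cast<double>(i) * step;
    return out;
}

// Affine interpolation between start and end; with include_endpoint the
// divisor is n - 1 so out.back() == end (n >= 2 assumed in that case).
std::vector<double> lin_affine(double start, double end, std::size_t n,
                               bool include_endpoint)
{
    std::vector<double> out(n);
    const double denom = static_cast<double>(include_endpoint ? n - 1 : n);
    for (std::size_t i = 0; i < n; ++i)
        out[i] = start + (end - start) * (static_cast<double>(i) / denom);
    return out;
}

int main()
{
    for (double v : lin_affine(0.0, 1.0, 5, true))
        std::cout << v << ' '; // 0 0.25 0.5 0.75 1
    std::cout << '\n';
}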
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/all.cpp b/dpnp/tensor/libtensor/source/reductions/all.cpp new file mode 100644 index 000000000000..a901b9e1d9a3 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/all.cpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
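The file below maps the boolean reduction onto the shared reduction kernels: sycl::logical_and as the reduction op, std::int32_t as the destination type, with dedicated strided, axis0-contiguous, and axis1-contiguous dispatch vectors. A scalar sketch of the axis1-contiguous case, the layout those specializations target:

#include <cstdint>
#include <iostream>
#include <vector>

// Reduce the trailing axis of a (rows x cols) row-major array with a
// logical AND, writing std::int32_t 0/1 per row, as in the axis1 case.
template <typename srcT>
std::vector<std::int32_t>
all_axis1(const std::vector<srcT> &src, std::size_t rows, std::size_t cols)
{
    std::vector<std::int32_t> dst(rows, 1); // identity of logical AND
    for (std::size_t r = 0; r < rows; ++r)
        for (std::size_t c = 0; c < cols; ++c)
            dst[r] = dst[r] && (src[r * cols + c] != srcT(0));
    return dst; // for "any", the op is OR and the identity is 0
}

int main()
{
    std::vector<float> a{1.f, 2.f, 0.f, 3.f}; // rows {1,2} and {0,3}
    for (auto v : all_axis1(a, 2, 2))
        std::cout << v << ' '; // 1 0
    std::cout << '\n';
}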
+//===---------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + all_reduction_strided_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + all_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static reduction_contig_impl_fn_ptr + all_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; + +template +struct AllStridedFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } +}; + +template +struct AllAxis1ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl; + } +}; + +template +struct AllAxis0ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl; + } +}; + +void populate_all_dispatch_vectors(void) +{ + using td_ns::DispatchVectorBuilder; + + DispatchVectorBuilder + all_dvb1; + all_dvb1.populate_dispatch_vector(all_reduction_strided_dispatch_vector); + + DispatchVectorBuilder + all_dvb2; + all_dvb2.populate_dispatch_vector( + all_reduction_axis1_contig_dispatch_vector); + + DispatchVectorBuilder + all_dvb3; + all_dvb3.populate_dispatch_vector( + all_reduction_axis0_contig_dispatch_vector); +}; + +using atomic_support::atomic_support_fn_ptr_t; +using atomic_support::check_atomic_support; +static atomic_support_fn_ptr_t all_atomic_support = + check_atomic_support; + +} // namespace impl + +void init_all(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_all_dispatch_vectors(); + using impl::all_reduction_axis0_contig_dispatch_vector; + using impl::all_reduction_axis1_contig_dispatch_vector; + using impl::all_reduction_strided_dispatch_vector; + + using impl::all_atomic_support; + + auto all_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + all_reduction_axis1_contig_dispatch_vector, + all_reduction_axis0_contig_dispatch_vector, + all_reduction_strided_dispatch_vector, all_atomic_support); + }; + m.def("_all", all_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/all.hpp b/dpnp/tensor/libtensor/source/reductions/all.hpp new file mode 100644 index 000000000000..5fb184e37c66 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/all.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, 
Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_all(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/any.cpp b/dpnp/tensor/libtensor/source/reductions/any.cpp new file mode 100644 index 000000000000..6859e46cbc4a --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/any.cpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    any_reduction_strided_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    any_reduction_axis1_contig_dispatch_vector[td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    any_reduction_axis0_contig_dispatch_vector[td_ns::num_types];
+
+template <typename fnT, typename srcTy>
+struct AnyStridedFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
+                                                           ReductionOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy>
+struct AnyAxis1ContigFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_axis1_over_group_with_atomics_contig_impl<srcTy, dstTy,
+                                                                ReductionOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy>
+struct AnyAxis0ContigFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_axis0_over_group_with_atomics_contig_impl<srcTy, dstTy,
+                                                                ReductionOpT>;
+    }
+};
+
+void populate_any_dispatch_vectors(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    DispatchVectorBuilder<reduction_strided_impl_fn_ptr, AnyStridedFactory,
+                          td_ns::num_types>
+        any_dvb1;
+    any_dvb1.populate_dispatch_vector(any_reduction_strided_dispatch_vector);
+
+    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AnyAxis1ContigFactory,
+                          td_ns::num_types>
+        any_dvb2;
+    any_dvb2.populate_dispatch_vector(
+        any_reduction_axis1_contig_dispatch_vector);
+
+    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AnyAxis0ContigFactory,
+                          td_ns::num_types>
+        any_dvb3;
+    any_dvb3.populate_dispatch_vector(
+        any_reduction_axis0_contig_dispatch_vector);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+using atomic_support::check_atomic_support;
+static atomic_support_fn_ptr_t any_atomic_support =
+    check_atomic_support<std::int32_t>;
+
+} // namespace impl
+
+void init_any(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_any_dispatch_vectors();
+        using impl::any_reduction_axis0_contig_dispatch_vector;
+        using impl::any_reduction_axis1_contig_dispatch_vector;
+        using impl::any_reduction_strided_dispatch_vector;
+
+        using impl::any_atomic_support;
+
+        auto any_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
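+            // py_boolean_reduction (declared in reduction_over_axis.hpp)
+            // validates the queue and arrays, then selects the axis-1
+            // contiguous, axis-0 contiguous, or generic strided kernel from
+            // the dispatch vectors above; the any_atomic_support callback
+            // reports whether the device supports the atomics these kernels
+            // rely on.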
+            return py_boolean_reduction(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                any_reduction_axis1_contig_dispatch_vector,
+                any_reduction_axis0_contig_dispatch_vector,
+                any_reduction_strided_dispatch_vector, any_atomic_support);
+        };
+        m.def("_any", any_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/any.hpp b/dpnp/tensor/libtensor/source/reductions/any.hpp
new file mode 100644
index 000000000000..4e368a674615
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/any.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_any(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/argmax.cpp b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
new file mode 100644
index 000000000000..af602371dfc5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
@@ -0,0 +1,276 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
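+///
+/// The argmax kernels return the index of a maximal element over the reduced
+/// axes: integer inputs compare values with sycl::maximum, while other types
+/// (half, float, double, complex) use su_ns::Maximum so that NaNs are
+/// handled consistently; ties are broken toward the smaller index by
+/// applying sycl::minimum to the candidate indices.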
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
+static search_strided_impl_fn_ptr
+    argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
+static search_contig_impl_fn_ptr
+    argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static search_contig_impl_fn_ptr
+    argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportForArgmaxReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::int64_t>,
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, std::int64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::int64_t>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::int64_t>,
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy, std::complex<float>, outTy,
+                                    std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::complex<double>, outTy,
+                                    std::int64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_argmax_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<search_strided_impl_fn_ptr,
+                         ArgmaxOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgmaxOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgmaxOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_argmax(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_argmax_over_axis_dispatch_tables;
+        populate_argmax_over_axis_dispatch_tables();
+        using impl::argmax_over_axis0_contig_temps_dispatch_table;
+        using impl::argmax_over_axis1_contig_temps_dispatch_table;
+        using impl::argmax_over_axis_strided_temps_dispatch_table;
+
+        auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_search_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                argmax_over_axis_strided_temps_dispatch_table,
+                argmax_over_axis0_contig_temps_dispatch_table,
+                argmax_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/argmax.hpp b/dpnp/tensor/libtensor/source/reductions/argmax.hpp
new file mode 100644
index 000000000000..3274f8c7d0cb
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/argmax.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_argmax(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/argmin.cpp b/dpnp/tensor/libtensor/source/reductions/argmin.cpp new file mode 100644 index 000000000000..4869b75eacf9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/argmin.cpp @@ -0,0 +1,276 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
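+///
+/// Analogous to argmax with the value comparison reversed: sycl::minimum for
+/// integer inputs, su_ns::Minimum where NaNs must be handled, with ties
+/// again resolved toward the smaller index.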
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
+static search_strided_impl_fn_ptr
+    argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
+static search_contig_impl_fn_ptr
+    argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static search_contig_impl_fn_ptr
+    argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportForArgminReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::int64_t>,
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, std::int64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::int64_t>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::int64_t>,
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy, std::complex<float>, outTy,
+                                    std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::complex<double>, outTy,
+                                    std::int64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_argmin_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<search_strided_impl_fn_ptr,
+                         ArgminOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgminOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgminOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_argmin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_argmin_over_axis_dispatch_tables;
+        populate_argmin_over_axis_dispatch_tables();
+        using impl::argmin_over_axis0_contig_temps_dispatch_table;
+        using impl::argmin_over_axis1_contig_temps_dispatch_table;
+        using impl::argmin_over_axis_strided_temps_dispatch_table;
+
+        auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_search_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                argmin_over_axis_strided_temps_dispatch_table,
+                argmin_over_axis0_contig_temps_dispatch_table,
+                argmin_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/argmin.hpp b/dpnp/tensor/libtensor/source/reductions/argmin.hpp
new file mode 100644
index 000000000000..1865c258a527
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/argmin.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_argmin(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp new file mode 100644 index 000000000000..351eab82ee6b --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp @@ -0,0 +1,255 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
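+///
+/// logsumexp(x) = log(sum(exp(x))) is computed with a tree reduction over
+/// temporaries only (there is no atomics path); the pairwise combiner
+/// su_ns::LogSumExp is numerically stable, effectively computing
+/// max(a, b) + log1p(exp(-|a - b|)) instead of a naive exp/sum/log.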
+//===---------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForLogSumExpReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                        ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_logsumexp_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using namespace td_ns;
+
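+    // Each table below is indexed by [source typeid][destination typeid]:
+    // one generic strided kernel plus two contiguous fast paths, reducing
+    // over the trailing (axis-1) or leading (axis-0) dimension.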
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         LogSumExpOverAxisTempsStridedFactory, num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(
+        logsumexp_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         LogSumExpOverAxis1TempsContigFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(
+        logsumexp_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         LogSumExpOverAxis0TempsContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(
+        logsumexp_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_logsumexp(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_logsumexp_over_axis_dispatch_tables;
+        populate_logsumexp_over_axis_dispatch_tables();
+        using impl::logsumexp_over_axis0_contig_temps_dispatch_table;
+        using impl::logsumexp_over_axis1_contig_temps_dispatch_table;
+        using impl::logsumexp_over_axis_strided_temps_dispatch_table;
+
+        using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+        using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+
+        auto logsumexp_pyapi = [&](const arrayT &src,
+                                   int trailing_dims_to_reduce,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_tree_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                logsumexp_over_axis_strided_temps_dispatch_table,
+                logsumexp_over_axis0_contig_temps_dispatch_table,
+                logsumexp_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype,
+                                             const py::dtype &output_dtype) {
+            return py_tree_reduction_dtype_supported(
+                input_dtype, output_dtype,
+                logsumexp_over_axis_strided_temps_dispatch_table);
+        };
+        m.def("_logsumexp_over_axis_dtype_supported",
+              logsumexp_dtype_supported, "", py::arg("arg_dtype"),
+              py::arg("out_dtype"));
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/logsumexp.hpp b/dpnp/tensor/libtensor/source/reductions/logsumexp.hpp
new file mode 100644
index 000000000000..2e2c19877db6
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/logsumexp.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_logsumexp(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/max.cpp b/dpnp/tensor/libtensor/source/reductions/max.cpp new file mode 100644 index 000000000000..628f7cfe8606 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/max.cpp @@ -0,0 +1,407 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
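+///
+/// Two kernel families back this reduction: an atomics-based path for types
+/// with device atomic support (chosen at runtime through
+/// max_atomic_support_vector) and a temporaries-based tree reduction for the
+/// rest; floating-point types use su_ns::Maximum so NaNs are handled
+/// consistently, integers use sycl::maximum.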
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    max_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    max_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+/* @brief Types supported by max reduction code based on atomic_ref */
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMaxReductionAtomic
+{
+    /* value is true if a kernel for <argTy, outTy> must be instantiated,
+     * false otherwise */
+    static constexpr bool is_defined = std::disjunction<
+        // input int32
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy,
+                                    std::uint32_t>,
+        // input int64
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy,
+                                    std::uint64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMaxReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy,
+                                    std::uint16_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy,
+                                    std::uint32_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy,
+                                    std::uint64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy, std::complex<float>, outTy,
+                                    std::complex<float>>,
+
+        td_ns::TypePairDefinedEntry<argTy, std::complex<double>, outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxisAtomicStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis1AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis0AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_max_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MaxOverAxisAtomicStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MaxOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis1AtomicContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis0AtomicContigFactory, td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types];
+
+void populate_max_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::MaxAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, MaxAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(max_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_max(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_max_over_axis_dispatch_tables;
+        populate_max_over_axis_dispatch_tables();
+        using impl::max_over_axis0_contig_atomic_dispatch_table;
+        using impl::max_over_axis0_contig_temps_dispatch_table;
+        using impl::max_over_axis1_contig_atomic_dispatch_table;
+        using impl::max_over_axis1_contig_temps_dispatch_table;
+        using impl::max_over_axis_strided_atomic_dispatch_table;
+        using impl::max_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_max_atomic_support_dispatch_vector;
+        populate_max_atomic_support_dispatch_vector();
+        using impl::max_atomic_support_vector;
+
+        auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                max_over_axis_strided_atomic_dispatch_table,
+                max_over_axis0_contig_atomic_dispatch_table,
+                max_over_axis1_contig_atomic_dispatch_table,
+                max_over_axis_strided_temps_dispatch_table,
+                max_over_axis0_contig_temps_dispatch_table,
+                max_over_axis1_contig_temps_dispatch_table,
+                max_atomic_support_vector);
+        };
+        m.def("_max_over_axis", max_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/max.hpp b/dpnp/tensor/libtensor/source/reductions/max.hpp
new file mode 100644
index 000000000000..bc242dc8d74b
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/max.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_max(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/min.cpp b/dpnp/tensor/libtensor/source/reductions/min.cpp new file mode 100644 index 000000000000..68bfdb583b0b --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/min.cpp @@ -0,0 +1,409 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
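+///
+/// Mirrors max.cpp with the ordering reversed: sycl::minimum for integers,
+/// su_ns::Minimum for floating-point types where NaNs must be handled, and
+/// the same runtime choice between atomics-based and temporaries-based
+/// kernels.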
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    min_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    min_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+/* @brief Types supported by min reduction code based on atomic_ref */
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMinReductionAtomic
+{
+    /* value is true if a kernel for <argTy, outTy> must be instantiated,
+     * false otherwise */
+    static constexpr bool is_defined = std::disjunction<
+        // input int32
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy,
+                                    std::uint32_t>,
+        // input int64
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy,
+                                    std::uint64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMinReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy,
+                                    std::uint16_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy,
+                                    std::uint32_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy,
+                                    std::uint64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy, std::complex<float>, outTy,
+                                    std::complex<float>>,
+
+        td_ns::TypePairDefinedEntry<argTy, std::complex<double>, outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxisAtomicStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis1AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis0AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_min_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MinOverAxisAtomicStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MinOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis1AtomicContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis0AtomicContigFactory, td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types];
+
+void populate_min_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::MinAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, MinAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(min_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_min(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_min_over_axis_dispatch_tables;
+        populate_min_over_axis_dispatch_tables();
+        using impl::min_over_axis0_contig_atomic_dispatch_table;
+        using impl::min_over_axis0_contig_temps_dispatch_table;
+        using impl::min_over_axis1_contig_atomic_dispatch_table;
+        using impl::min_over_axis1_contig_temps_dispatch_table;
+        using impl::min_over_axis_strided_atomic_dispatch_table;
+        using impl::min_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_min_atomic_support_dispatch_vector;
+        populate_min_atomic_support_dispatch_vector();
+        using impl::min_atomic_support_vector;
+
+        auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                min_over_axis_strided_atomic_dispatch_table,
+                min_over_axis0_contig_atomic_dispatch_table,
+                min_over_axis1_contig_atomic_dispatch_table,
+                min_over_axis_strided_temps_dispatch_table,
+                min_over_axis0_contig_temps_dispatch_table,
+                min_over_axis1_contig_temps_dispatch_table,
+                min_atomic_support_vector);
+        };
+        m.def("_min_over_axis", min_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/min.hpp b/dpnp/tensor/libtensor/source/reductions/min.hpp
new file mode 100644
index 000000000000..e054f44539f3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/min.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_min(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/prod.cpp b/dpnp/tensor/libtensor/source/reductions/prod.cpp new file mode 100644 index 000000000000..9ecd403159b0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/prod.cpp @@ -0,0 +1,460 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input 
int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr 
(TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types]; + +void populate_prod_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::ProductAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(prod_atomic_support_vector); +} + +} // namespace impl + +void init_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis0_contig_temps_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_temps_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + using impl::populate_prod_atomic_support_dispatch_vector; + populate_prod_atomic_support_dispatch_vector(); + using impl::prod_atomic_support_vector; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + prod_over_axis1_contig_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_temps_dispatch_table, + prod_over_axis1_contig_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), 
py::arg("sycl_queue")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/prod.hpp b/dpnp/tensor/libtensor/source/reductions/prod.hpp new file mode 100644 index 000000000000..15b1c07e5ddd --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/prod.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_prod(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp new file mode 100644 index 000000000000..b8a042e9a55b --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -0,0 +1,251 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForHypotReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + 
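The `TypePairSupportData*` structs above all follow the same compile-time gating idiom, but the angle-bracketed template arguments of the `td_ns::TypePairDefinedEntry` entries do not survive in this rendering. As a reading aid, here is a small self-contained C++17 sketch of that idiom; the entry names mirror dpctl's `type_dispatch_building.hpp`, while the concrete type pairs listed are illustrative only, not the table from the patch.

#include <cstdint>
#include <type_traits>

// An entry contributes value == true to the disjunction only when the
// queried (Ty, outTy) pair equals the (ArgTy, ResTy) pair it encodes;
// every defined entry reports is_defined == true.
template <typename Ty, typename ArgTy, typename outTy, typename ResTy>
struct TypePairDefinedEntry
    : std::bool_constant<std::is_same_v<Ty, ArgTy> &&
                         std::is_same_v<outTy, ResTy>>
{
    static constexpr bool is_defined = true;
};

// Fall-through terminal: value == true, so std::disjunction always stops
// here when nothing matched, and it reports is_defined == false.
struct NotDefinedEntry : std::true_type
{
    static constexpr bool is_defined = false;
};

// std::disjunction inherits from the first base whose ::value is true,
// so ::is_defined is taken from the first matching entry, or from the
// terminal entry when the pair is unsupported.
template <typename srcTy, typename dstTy>
struct PairSupportSketch
{
    static constexpr bool is_defined = std::disjunction<
        TypePairDefinedEntry<srcTy, std::int32_t, dstTy, std::int32_t>,
        TypePairDefinedEntry<srcTy, float, dstTy, float>,
        TypePairDefinedEntry<srcTy, double, dstTy, double>,
        NotDefinedEntry>::is_defined;
};

static_assert(PairSupportSketch<float, float>::is_defined);
static_assert(!PairSupportSketch<float, double>::is_defined);

A factory's `get()` then returns a kernel function pointer only when `is_defined` holds for its `(srcTy, dstTy)` instantiation, and `nullptr` otherwise, which is what leaves the unsupported cells of the dispatch tables empty.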
+template +struct HypotOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_hypot_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_reduce_hypot(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_hypot_over_axis_dispatch_tables; + populate_hypot_over_axis_dispatch_tables(); + using impl::hypot_over_axis0_contig_temps_dispatch_table; + using impl::hypot_over_axis1_contig_temps_dispatch_table; + using impl::hypot_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + hypot_over_axis_strided_temps_dispatch_table, + hypot_over_axis0_contig_temps_dispatch_table, + hypot_over_axis1_contig_temps_dispatch_table); + }; + m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto hypot_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + hypot_over_axis_strided_temps_dispatch_table); + }; + m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.hpp new file mode 100644 index 000000000000..c0a16345af75 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_reduce_hypot(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp new file mode 100644 index 000000000000..af6c3f0d513a --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp @@ -0,0 +1,143 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::py_internal::atomic_support
+{
+
+typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc);
+
+/*! @brief Function which returns a constant value for atomic support */
+template <bool return_value>
+bool fixed_decision(const sycl::queue &, sycl::usm::alloc)
+{
+    return return_value;
+}
+
+/*! @brief Template for querying atomic support for a type on a device */
+template <typename T>
+bool check_atomic_support(const sycl::queue &exec_q,
+                          sycl::usm::alloc usm_alloc_type)
+{
+    static constexpr bool atomic32 = (sizeof(T) == 4);
+    static constexpr bool atomic64 = (sizeof(T) == 8);
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr ((!atomic32 && !atomic64) || is_complex<T>::value) {
+        return fixed_decision<false>(exec_q, usm_alloc_type);
+    }
+    else {
+        bool supports_atomics = false;
+        const sycl::device &dev = exec_q.get_device();
+        if constexpr (atomic64) {
+            if (!dev.has(sycl::aspect::atomic64)) {
+                return false;
+            }
+        }
+        switch (usm_alloc_type) {
+        case sycl::usm::alloc::shared:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_shared_allocations);
+            break;
+        case sycl::usm::alloc::host:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_host_allocations);
+            break;
+        case sycl::usm::alloc::device:
+            supports_atomics = true;
+            break;
+        default:
+            supports_atomics = false;
+        }
+        return supports_atomics;
+    }
+}
+
+template <typename fnT, typename T>
+struct ArithmeticAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (std::is_floating_point_v<T> ||
+                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
+        {
+            // For real- and complex-valued floating-point types, tree
+            // reduction has better round-off accumulation properties: its
+            // round-off error grows proportionally to log2(reduction_size),
+            // while the naive elementwise summation used by the atomic
+            // implementation has round-off error growing proportionally to
+            // reduction_size. Hence reductions over floating-point types
+            // should always use the tree-reduction algorithm, even though
+            // an atomic implementation may be applicable.
+            return fixed_decision<false>;
+        }
+        else {
+            return check_atomic_support<T>;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct MinMaxAtomicSupportFactory
+{
+    fnT get() { return check_atomic_support<T>; }
+};
+
+template <typename fnT, typename T>
+struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct ProductAtomicSupportFactory
+    : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+} // namespace dpctl::tensor::py_internal::atomic_support
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_common.cpp b/dpnp/tensor/libtensor/source/reductions/reduction_common.cpp
new file mode 100644
index 000000000000..fca5e09e2fe5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_common.cpp
@@ -0,0 +1,69 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "all.hpp"
+#include "any.hpp"
+#include "argmax.hpp"
+#include "argmin.hpp"
+#include "logsumexp.hpp"
+#include "max.hpp"
+#include "min.hpp"
+#include "prod.hpp"
+#include "reduce_hypot.hpp"
+#include "sum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+/*! @brief Add reduction functions to Python module */
+void init_reduction_functions(py::module_ m)
+{
+    init_all(m);
+    init_any(m);
+    init_argmax(m);
+    init_argmin(m);
+    init_logsumexp(m);
+    init_max(m);
+    init_min(m);
+    init_prod(m);
+    init_reduce_hypot(m);
+    init_sum(m);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_common.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_common.hpp
new file mode 100644
index 000000000000..4df67c16bc4e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_common.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_reduction_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
new file mode 100644
index 000000000000..8224163ccb19
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -0,0 +1,1307 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension, specifically functions for reductions. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +/* ====================== dtype supported ======================== */ + +/*! @brief Template implementing Python API for querying type support by + * reduction which may support atomics */ +template +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == "device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = check_atomic_support[out_typeid](q, kind); + + if (supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + +/*! 
@brief Template implementing Python API for querying type support by tree + * reduction */ +template +bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &temps_dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + auto fn = temps_dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +/* ==================== Generic reductions ====================== */ + +/*! @brief Template implementing Python API for reduction over axis which may + * support atomics */ +template +std::pair py_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &atomic_dispatch_table, + const contig_fnT &axis0_atomic_dispatch_table, + const contig_fnT &axis1_atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table, + const SupportAtomicFnT &check_atomic_support) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + 
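// [Editor's sketch, not part of the patch] The lookups that follow map the
// arrays' NumPy type numbers onto the indices used by the
// td_ns::num_types x td_ns::num_types dispatch tables; the USM kind of
// dst's allocation then decides between the atomic and the
// temporaries-based implementation. A hypothetical trace for a
// float32 -> float32 reduction into device USM:
//
//   src_typeid = array_types.typenum_to_lookup_id(src_typenum);  // f4 id
//   dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);  // f4 id
//   usm_type == sycl::usm::alloc::device
//   supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type);
//     // false when the entry came from Sum/ProductAtomicSupportFactory
//     // (floating-point reductions prefer tree reduction for accuracy),
//     // but may be true for min/max, whose MinMaxAtomicSupportFactory
//     // queries the device aspects directly
//     // (see reduction_atomic_support.hpp)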
const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + // TODO: not used anywhere + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT 
simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + std::size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); + } + else if (static_cast( + simplified_reduction_src_strides[0]) == iter_nelems) { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; + + if (supports_atomics) { + fn = atomic_dispatch_table[src_typeid][dst_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + } + + std::vector host_task_events{}; + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto arrays_metainfo_packing_triple_ = + 
device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + auto tmp_alloc_owner = + std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_alloc_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/* ================= No atomic reductions ====================== */ + +/*! @brief Template implementing Python API for reduction over axis without + * atomics */ +template +std::pair py_tree_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); 
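// [Editor's note, not part of the patch] Same guard as in
// py_reduction_over_axis above: the kernels read src while writing dst
// (possibly over several passes through temporaries), so any aliasing
// between the two would make the result order-dependent.
// MemoryOverlap() compares the address ranges spanned by the two
// usm_ndarrays, and the branch below rejects any intersection with a
// py::value_error.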
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1)) {
+        auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
+        auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT simplified_reduction_shape;
+    shT simplified_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    simplify_iteration_space_1(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        simplified_reduction_shape, simplified_reduction_src_strides,
+        reduction_src_offset);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
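+        // When every source dimension is reduced away there is no
+        // iteration space left; normalizing it to a single "virtual"
+        // element with shape {1} and strides {0} (continued below) lets
+        // the strided kernel run unchanged: it visits exactly one output
+        // element and never advances the src/dst base pointers.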
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (simplified_reduction_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(
+                     simplified_iteration_src_strides[0]) == reduction_nelems);
+        }
+        else if (static_cast<std::size_t>(
+                     simplified_reduction_src_strides[0]) == iter_nelems) {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    auto fn = temps_dispatch_table[src_typeid][dst_typeid];
+    if (fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            simplified_reduction_shape, simplified_reduction_src_strides);
+    auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_));
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+    const py::ssize_t *temp_allocation_ptr = tmp_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    const py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto reduction_ev =
+        fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(),
+           iteration_nd, iter_shape_and_strides, iteration_src_offset,
+           iteration_dst_offset,
+           reduction_nd, // number of dimensions being reduced
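+           // The packed device buffer is assumed to hold the iteration
+           // block first (shape, src strides, dst strides, each
+           // iteration_nd entries long, hence the 3 * size() offset used
+           // above), followed by the reduction block of shape and src
+           // strides, 2 * reduction_nd entries in total:
+           //
+           //   [it_shape | it_src_str | it_dst_str | red_shape | red_src_str]
+           //   ^ iter_shape_and_strides            ^ reduction_shape_stride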
+           reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {reduction_ev}, tmp_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, reduction_ev);
+}
+
+/*! @brief Template implementing Python API for searching over an axis */
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_search_over_axis(
+    const dpctl::tensor::usm_ndarray &src,
+    int trailing_dims_to_reduce, // reduction is over this many trailing indices
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends,
+    const strided_fnT &strided_dispatch_table,
+    const contig_fnT &axis0_contig_dispatch_table,
+    const contig_fnT &axis1_contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int iteration_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
+        throw py::value_error("trailing_dims_to_reduce must be positive, but "
+                              "not greater than the rank of the array being "
+                              "reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iteration_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    std::size_t reduction_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        reduction_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
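+            // Every entry point in this header returns the same pair of
+            // events: the first is a host-task event Python can wait on
+            // before releasing the arrays (keep_args_alive), the second is
+            // the computation event other kernels may depend on. A caller
+            // wanting a synchronous result would do, roughly:
+            //
+            //   auto [ht_ev, comp_ev] = py_search_over_axis(/* ... */);
+            //   comp_ev.wait(); // result is ready
+            //   ht_ev.wait();   // temporaries and references released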
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig && dst_nd == 1) {
+        auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT compact_reduction_shape;
+    shT compact_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    // compact (rather than simplify) the reduction space: a search must
+    // preserve the traversal order of the reduced elements, since the
+    // reported index depends on it
+    compact_iteration_space(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        compact_reduction_shape, compact_reduction_src_strides);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (compact_reduction_src_strides[0] == 1) {
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(
+                     simplified_iteration_src_strides[0]) == reduction_nelems);
+        }
+        else if (static_cast<std::size_t>(compact_reduction_src_strides[0]) ==
+                 iter_nelems) {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1) {
+            auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event =
+                    dpctl::utils::keep_args_alive(
+                        exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    auto fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    auto arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            compact_reduction_shape, compact_reduction_src_strides);
+    auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_));
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+    const py::ssize_t *temp_allocation_ptr = tmp_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    const py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(),
+                      dst.get_data(), iteration_nd, iter_shape_and_strides,
+                      iteration_src_offset, iteration_dst_offset,
+                      reduction_nd, // number of dimensions being reduced
+                      reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {comp_ev}, tmp_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, comp_ev);
+}
+
+/* ================= Atomic only reductions ====================== */
+
+/*!
+ * @brief Template implementing Python API for boolean reductions over an axis
+ */
+template <typename contig_dispatchT,
+          typename strided_dispatchT,
+          typename atomic_support_fnT>
+std::pair<sycl::event, sycl::event>
+    py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
+                         int trailing_dims_to_reduce,
+                         const dpctl::tensor::usm_ndarray &dst,
+                         sycl::queue &exec_q,
+                         const std::vector<sycl::event> &depends,
+                         const contig_dispatchT &axis1_contig_dispatch_vector,
+                         const contig_dispatchT &axis0_contig_dispatch_vector,
+                         const strided_dispatchT &strided_dispatch_vector,
+                         const atomic_support_fnT check_atomic_support)
+{
+    int src_nd = src.get_ndim();
+    int iter_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iter_nd < 0) {
+        throw py::value_error("trailing_dims_to_reduce must be positive, but "
+                              "not greater than the rank of the array being "
+                              "reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iter_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t dst_nelems = dst.get_size();
+
+    std::size_t red_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        red_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(dst, src)) {
+        throw py::value_error("Arrays are expected to have no memory overlap");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    static constexpr int int32_typeid =
+        static_cast<int>(td_ns::typenum_t::INT32);
+    if (dst_typeid != int32_typeid) {
+        throw py::value_error(
+            "Unexpected data type of destination array, expecting 'int32'");
+    }
+
+    void *data_ptr = dst.get_data();
+    const auto &ctx = exec_q.get_context();
+    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
+
+    bool supports_atomics = check_atomic_support(exec_q, usm_type);
+    if (!supports_atomics) {
+        throw py::value_error(
+            "This reduction is not supported for this device and usm_type.");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
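+    // The contiguous fast paths below treat src as a logical
+    // (iter_nelems x red_nelems) matrix: C-contiguous src and dst reduce
+    // each row with the axis-1 kernel, while an F-contiguous src reduces
+    // each column with the axis-0 kernel. E.g. a C-contiguous array of
+    // shape (4, 5) reduced over its last axis has iter_nelems == 4 and
+    // red_nelems == 5; each output element consumes one contiguous run
+    // of 5 inputs.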
+    // single-element dst (dst_nelems == 1, not == 0) means a full
+    // reduction of an F-contiguous source, which is one contiguous run
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1)) {
+        auto fn = axis1_contig_dispatch_vector[src_typeid];
+        static constexpr py::ssize_t zero_offset = 0;
+
+        sycl::event red_ev =
+            fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
+               zero_offset, zero_offset, depends);
+
+        sycl::event keep_args_event =
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+        return std::make_pair(keep_args_event, red_ev);
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
+        auto fn = axis0_contig_dispatch_vector[src_typeid];
+        static constexpr py::ssize_t zero_offset = 0;
+
+        sycl::event red_ev =
+            fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
+               zero_offset, zero_offset, depends);
+
+        sycl::event keep_args_event =
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+        return std::make_pair(keep_args_event, red_ev);
+    }
+
+    auto src_shape_vecs = src.get_shape_vector();
+    auto src_strides_vecs = src.get_strides_vector();
+    auto dst_strides_vecs = dst.get_strides_vector();
+
+    int simplified_red_nd = trailing_dims_to_reduce;
+
+    using shT = std::vector<py::ssize_t>;
+    shT red_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                        std::end(src_strides_vecs));
+
+    shT simplified_red_shape;
+    shT simplified_red_src_strides;
+    py::ssize_t red_src_offset(0);
+
+    simplify_iteration_space_1(
+        simplified_red_nd, src_shape_ptr + dst_nd, red_src_strides,
+        // output
+        simplified_red_shape, simplified_red_src_strides, red_src_offset);
+
+    shT iter_src_strides(std::begin(src_strides_vecs),
+                         std::begin(src_strides_vecs) + iter_nd);
+    shT const &iter_dst_strides = dst_strides_vecs;
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    if (simplified_red_nd == 1 && iter_nd == 1) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (simplified_red_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iter_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iter_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(simplified_iter_src_strides[0]) ==
+                 red_nelems);
+        }
+        else if (static_cast<std::size_t>(simplified_red_src_strides[0]) ==
+                 iter_nelems) {
+            mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) &&
+                                    (simplified_iter_src_strides[0] == 1);
+        }
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            auto fn = axis1_contig_dispatch_vector[src_typeid];
+
+            sycl::event red_ev =
+                fn(exec_q, iter_nelems, red_nelems, src_data, dst_data,
+                   iter_src_offset, iter_dst_offset, red_src_offset, depends);
+
+            sycl::event keep_args_event =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+            return std::make_pair(keep_args_event, red_ev);
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_contig_dispatch_vector[src_typeid];
+
+            sycl::event red_ev =
+                fn(exec_q, iter_nelems, red_nelems, src_data, dst_data,
+                   iter_src_offset, iter_dst_offset, red_src_offset, depends);
+
+            sycl::event keep_args_event =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+            return std::make_pair(keep_args_event, red_ev);
+        }
+    }
+
+    auto fn = strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events{};
+    auto iter_red_metadata_packing_triple_ =
+        dpctl::tensor::offset_utils::device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_iter_shape,
+            simplified_iter_src_strides, simplified_iter_dst_strides,
+            simplified_red_shape, simplified_red_src_strides);
+    auto packed_shapes_strides_owner =
+        std::move(std::get<0>(iter_red_metadata_packing_triple_));
+    const auto &copy_metadata_ev =
+        std::get<2>(iter_red_metadata_packing_triple_);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *red_shape_stride =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto red_ev =
+        fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, iter_nd,
+           iter_shape_and_strides, iter_src_offset, iter_dst_offset,
+           simplified_red_nd, red_shape_stride, red_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {red_ev}, packed_shapes_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, red_ev);
+}
+
+extern void init_reduction_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/sum.cpp b/dpnp/tensor/libtensor/source/reductions/sum.cpp
new file mode 100644
index 000000000000..9a0d212ed8da
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/sum.cpp
@@ -0,0 +1,460 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr 
(TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types]; + +void populate_sum_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::SumAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(sum_atomic_support_vector); +} + +} // namespace impl + +void init_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis0_contig_temps_dispatch_table; + using impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_temps_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + using impl::populate_sum_atomic_support_dispatch_vector; + populate_sum_atomic_support_dispatch_vector(); + using impl::sum_atomic_support_vector; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_temps_dispatch_table, + sum_over_axis1_contig_temps_dispatch_table, + sum_atomic_support_vector); + }; + 
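+        // py_reduction_over_axis receives both table families plus the
+        // atomic-support vector and picks the atomic or tree path per
+        // call. A hypothetical max-reduction would be wired the same way
+        // (all names below are illustrative, not part of this patch):
+        //
+        //   auto max_pyapi = [&](const arrayT &src, int trailing,
+        //                        const arrayT &dst, sycl::queue &q,
+        //                        const event_vecT &deps = {}) {
+        //       return py_reduction_over_axis(
+        //           src, trailing, dst, q, deps,
+        //           max_strided_atomic_table, max_axis0_atomic_table,
+        //           max_axis1_atomic_table, max_strided_temps_table,
+        //           max_axis0_temps_table, max_axis1_temps_table,
+        //           max_atomic_support_vector);
+        //   };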
m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/sum.hpp b/dpnp/tensor/libtensor/source/reductions/sum.hpp new file mode 100644 index 000000000000..08add902a049 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/sum.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sum(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/repeat.cpp b/dpnp/tensor/libtensor/source/repeat.cpp new file mode 100644 index 000000000000..b809160e257b --- /dev/null +++ b/dpnp/tensor/libtensor/source/repeat.cpp @@ -0,0 +1,819 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/repeat.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_fn_ptr_t; +static repeat_by_sequence_fn_ptr_t + repeat_by_sequence_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_1d_fn_ptr_t; +static repeat_by_sequence_1d_fn_ptr_t + repeat_by_sequence_1d_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_fn_ptr_t; +static repeat_by_scalar_fn_ptr_t + repeat_by_scalar_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_1d_fn_ptr_t; +static repeat_by_scalar_1d_fn_ptr_t + repeat_by_scalar_1d_dispatch_vector[td_ns::num_types]; + +void init_repeat_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::repeat::RepeatSequenceFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(repeat_by_sequence_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatSequence1DFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(repeat_by_sequence_1d_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalarFactory; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(repeat_by_scalar_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalar1DFactory; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(repeat_by_scalar_1d_dispatch_vector); +} + +std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + int axis, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, + {src, reps, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = 
cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + std::size_t orthog_nelems(1); // number of orthogonal iterations + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis + 1; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + + std::size_t src_axis_nelems(1); + if (src_nd > 0) { + src_axis_nelems = src_shape[axis]; + } + std::size_t dst_axis_nelems(dst_shape[axis]); + + // shape at repeated axis must be equal to the sum of reps + if (!same_orthog_dims || src_axis_nelems != reps_sz || + src_axis_nelems != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (orthog_nelems == 0 || src_axis_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * dst_axis_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + sycl::event repeat_ev; + std::vector host_task_events{}; + if (axis == 0 && src_nd < 2) { + // empty orthogonal directions + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = 
std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = + fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p, + cumsum_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0], + reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + + auto fn = repeat_by_sequence_dispatch_vector[src_typeid]; + + int orthog_nd = src_nd - 1; + + using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + split_iteration_space(src_shape_vec, src_strides_vec, axis, axis + 1, + orthog_src_shape, axis_src_shape, + orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + split_iteration_space(dst_shape_vec, dst_strides_vec, axis, axis + 1, + orthog_dst_shape, axis_dst_shape, + orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, src_axis_nelems, src_data_p, + dst_data_p, reps_data_p, cumsum_data_p, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexers along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], + // data to build indexer for reps array + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); + 
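+        // Contract recap: with reps = {1, 2, 3} along the repeated axis,
+        // the caller supplies cumsum = {1, 3, 6} (inclusive prefix sum,
+        // int64), src extent 3 along that axis, and dst extent 6; element
+        // i of src fills the dst slots [cumsum[i] - reps[i], cumsum[i]),
+        // so the kernel can map each dst slot back to its source element
+        // by searching cumsum.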
host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, + {src, reps, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t src_sz = src.get_size(); + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = cumsum.get_size(); + + // shape at repeated axis must be equal to the sum of reps + if (src_sz != reps_sz || src_sz != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + dst.get_size()); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, cumsum, or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = 
reps.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shapes_strides = + packed_src_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn( + exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, + src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0], + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + int axis, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + std::size_t orthog_nelems(1); // number of orthogonal iterations + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis + 1; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + + std::size_t src_axis_nelems(1); + if (src_nd > 0) { + src_axis_nelems = src_shape[axis]; + } + std::size_t dst_axis_nelems(dst_shape[axis]); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if (!same_orthog_dims || (src_axis_nelems * reps) != dst_axis_nelems) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (orthog_nelems == 0 || src_axis_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * (src_axis_nelems * reps)); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if 
(overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event repeat_ev; + std::vector host_task_events{}; + if (axis == 0 && src_nd < 2) { + // empty orthogonal directions + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, dst_shape_vec[0], + dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + + auto fn = repeat_by_scalar_dispatch_vector[src_typeid]; + + int orthog_nd = src_nd - 1; + + using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + split_iteration_space(src_shape_vec, src_strides_vec, axis, axis + 1, + orthog_src_shape, axis_src_shape, + orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + split_iteration_space(dst_shape_vec, dst_strides_vec, axis, axis + 1, + orthog_dst_shape, axis_dst_shape, + orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + + simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using 
dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, dst_axis_nelems, src_data_p, + dst_data_p, reps, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexer along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends) +{ + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t src_sz = src.get_size(); + std::size_t dst_sz = dst.get_size(); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if ((src_sz * reps) != dst_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + src_sz * reps); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + std::vector host_task_events{}; 
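Every entry point in this file returns a pair of sycl::event objects: the second orders the device computation, while the first is a host task that keeps the Python argument objects alive and releases packed temporaries once the computation finishes (via dpctl's keep_args_alive and async_smart_free helpers). A minimal sketch of that pairing using only standard SYCL 2020 primitives; the function name and the int payload are hypothetical stand-ins:

    // Sketch only: models the (keep-alive event, computation event)
    // contract of the bindings above with plain SYCL primitives.
    #include <sycl/sycl.hpp>

    #include <memory>
    #include <utility>
    #include <vector>

    std::pair<sycl::event, sycl::event>
        launch_with_cleanup(sycl::queue &q,
                            std::shared_ptr<int> packed_metadata,
                            const std::vector<sycl::event> &depends)
    {
        // computation event: device work ordered after `depends`
        sycl::event comp_ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(depends);
            cgh.single_task([]() { /* kernel body elided */ });
        });

        // cleanup event: a host task that runs after the computation and
        // drops the last reference to the packed metadata; the real code
        // also decrements Python refcounts here
        sycl::event cleanup_ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(comp_ev);
            cgh.host_task([packed_metadata]() {});
        });

        return std::make_pair(cleanup_ev, comp_ev);
    }

Callers wait on the first event before letting the Python objects go out of scope, and chain further device work off the second.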
+ + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/repeat.hpp b/dpnp/tensor/libtensor/source/repeat.hpp new file mode 100644 index 000000000000..5835377fb29c --- /dev/null +++ b/dpnp/tensor/libtensor/source/repeat.hpp @@ -0,0 +1,83 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
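For orientation, the contract that the four repeat entry points above implement is compact enough to state in host code. The sketch below illustrates the semantics only, not the SYCL kernels dispatched through the tables:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // repeat-by-sequence: src[i] is written reps[i] times; cumsum is the
    // inclusive prefix sum of reps, so src[i] fills the dst segment
    // [cumsum[i] - reps[i], cumsum[i])
    std::vector<double>
        repeat_by_sequence(const std::vector<double> &src,
                           const std::vector<std::int64_t> &reps,
                           const std::vector<std::int64_t> &cumsum)
    {
        std::vector<double> dst(cumsum.empty() ? 0 : cumsum.back());
        for (std::size_t i = 0; i < src.size(); ++i) {
            for (std::int64_t j = cumsum[i] - reps[i]; j < cumsum[i]; ++j) {
                dst[j] = src[i];
            }
        }
        return dst;
    }

    // repeat-by-scalar: every element is written reps times, matching the
    // dst.size() == src.size() * reps check performed above
    std::vector<double> repeat_by_scalar(const std::vector<double> &src,
                                         std::int64_t reps)
    {
        std::vector<double> dst;
        dst.reserve(src.size() * static_cast<std::size_t>(reps));
        for (const double v : src) {
            dst.insert(dst.end(), reps, v);
        }
        return dst;
    }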
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_repeat_dispatch_vectors(void); + +extern std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + int axis, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + int axis, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 000000000000..573aaeb0a60b --- /dev/null +++ b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,542 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
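The simplify_iteration_space family defined in the next file contracts adjacent axes whose strides line up, so that strided kernels iterate over as few dimensions as possible; the nd == 1 branches additionally flip negative strides and fold the displacement into the array offset. A standalone sketch of the contraction rule, assuming the same merge condition as dpctl's simplify_iteration_stride (the helper name contract_dims is made up):

    #include <cstddef>
    #include <vector>

    using shape_t = std::vector<std::ptrdiff_t>;

    // merge axis pairs where the outer stride steps exactly over the
    // inner axis: strides[i] == shape[i + 1] * strides[i + 1]
    void contract_dims(shape_t &shape, shape_t &strides)
    {
        if (shape.size() < 2) {
            return;
        }
        std::size_t w = 0; // index of the axis currently being grown
        for (std::size_t r = 1; r < shape.size(); ++r) {
            if (strides[w] == shape[r] * strides[r]) {
                shape[w] *= shape[r]; // merge axis r into axis w
                strides[w] = strides[r];
            }
            else {
                ++w;
                shape[w] = shape[r];
                strides[w] = strides[r];
            }
        }
        shape.resize(w + 1);
        strides.resize(w + 1);
    }
    // e.g. a C-contiguous (3, 4) array has strides (4, 1); contraction
    // yields shape (12), strides (1), i.e. a flat 1-d traversal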
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "simplify_iteration_space.hpp" +#include "utils/strided_iters.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + using dpctl::tensor::strides::simplify_iteration_stride; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + + simplified_strides.reserve(nd); + simplified_strides.insert(std::end(simplified_strides), + std::begin(strides), std::end(strides)); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + simplified_strides.push_back((strides[0] >= 0) ? strides[0] + : -strides[0]); + if ((strides[0] < 0) && (shape[0] > 1)) { + offset += (shape[0] - 1) * strides[0]; + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &src_strides, + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_two_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::begin(simplified_shape), shape, + shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_src_strides.insert(std::end(simplified_src_strides), + std::begin(src_strides), + std::end(src_strides)); + assert(simplified_src_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src_offset = 0; + dst_offset = 0; + // 
Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src_strides[0] < 0 && dst_strides[0] < 0) { + simplified_src_strides.push_back(-src_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src_offset += (shape[0] - 1) * src_strides[0]; + dst_offset += (shape[0] - 1) * dst_strides[0]; + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_three_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + 
simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_4( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // src3 + std::vector const &src3_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_src3_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &src3_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_four_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_src3_strides.reserve(nd); + simplified_src3_strides.insert(std::end(simplified_src3_strides), + std::begin(src3_strides), + std::end(src3_strides)); + assert(simplified_src3_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_four_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_src3_strides.data(), + simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + src3_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_src3_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + src3_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_src3_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (src3_strides[0] < 0) && (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_src3_strides.push_back(-src3_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + 
src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + src3_offset += src3_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_src3_strides.push_back(src3_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +py::ssize_t _ravel_multi_index_c(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(nd - 1 - i) * s; + s *= 
shape.at(nd - 1 - i); + } + + return flat_index; +} + +py::ssize_t _ravel_multi_index_f(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(i) * s; + s *= shape.at(i); + } + + return flat_index; +} + +std::vector _unravel_index_c(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[nd - 1 - dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[nd - 1 - dim] = r; + i_ = q; + } + if (nd) { + mi[0] = i_; + } + return mi; +} + +std::vector _unravel_index_f(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[dim] = r; + i_ = q; + } + if (nd) { + mi[nd - 1] = i_; + } + return mi; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/simplify_iteration_space.hpp b/dpnp/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 000000000000..acbc833157d1 --- /dev/null +++ b/dpnp/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
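The _ravel_multi_index_c/_unravel_index_c pair above round-trips between flat and multi-indices with the last axis varying fastest (the _f variants use the first axis instead). A small self-checking example of the C-order logic:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    using shape_t = std::vector<std::ptrdiff_t>;

    // C order: the last axis varies fastest
    std::ptrdiff_t ravel_c(const shape_t &mi, const shape_t &shape)
    {
        std::ptrdiff_t flat = 0, s = 1;
        for (std::size_t i = shape.size(); i-- > 0;) {
            flat += mi[i] * s;
            s *= shape[i];
        }
        return flat;
    }

    shape_t unravel_c(std::ptrdiff_t flat, const shape_t &shape)
    {
        shape_t mi(shape.size());
        for (std::size_t i = shape.size(); i-- > 0;) {
            mi[i] = flat % shape[i];
            flat /= shape[i];
        }
        return mi;
    }

    int main()
    {
        const shape_t shape{2, 3, 4};
        assert(ravel_c({1, 2, 3}, shape) == 23); // 1*12 + 2*4 + 3
        assert(unravel_c(23, shape) == (shape_t{1, 2, 3}));
        return 0;
    }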
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector &, + std::vector &, + py::ssize_t &); + +void simplify_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector const &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_3(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_4(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // src3 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + +py::ssize_t _ravel_multi_index_c(std::vector const &, + std::vector const &); +py::ssize_t _ravel_multi_index_f(std::vector const &, + std::vector const &); +std::vector _unravel_index_c(py::ssize_t, + std::vector const &); +std::vector _unravel_index_f(py::ssize_t, + std::vector const &); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/isin.cpp b/dpnp/tensor/libtensor/source/sorting/isin.cpp new file mode 100644 index 000000000000..f1ae5863bbb9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/isin.cpp @@ -0,0 +1,325 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/sorting/isin.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ +namespace detail +{ + +using dpctl::tensor::kernels::isin_contig_impl_fp_ptr_t; + +static isin_contig_impl_fp_ptr_t + isin_contig_impl_dispatch_vector[td_ns::num_types]; + +template +struct IsinContigFactory +{ + constexpr IsinContigFactory() {} + + fnT get() const + { + using dpctl::tensor::kernels::isin_contig_impl; + return isin_contig_impl; + } +}; + +using dpctl::tensor::kernels::isin_strided_impl_fp_ptr_t; + +static isin_strided_impl_fp_ptr_t + isin_strided_impl_dispatch_vector[td_ns::num_types]; + +template +struct IsinStridedFactory +{ + constexpr IsinStridedFactory() {} + + fnT get() const + { + using dpctl::tensor::kernels::isin_strided_impl; + return isin_strided_impl; + } +}; + +void init_isin_dispatch_vector(void) +{ + + // Contiguous input function dispatch + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isin_contig_impl_dispatch_vector); + + // Strided input function dispatch + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isin_strided_impl_dispatch_vector); +} + +} // namespace detail + +/*! 
@brief search for needle from needles in sorted hay */ +std::pair + py_isin(const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const bool invert, + const std::vector &depends) +{ + const int hay_nd = hay.get_ndim(); + const int needles_nd = needles.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (hay_nd != 1 || needles_nd != dst_nd) { + throw py::value_error("Array dimensions mismatch"); + } + + // check that needle and dst have the same shape + std::size_t needles_nelems(1); + bool same_shape(true); + + const std::size_t hay_nelems = static_cast(hay.get_shape(0)); + + const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + for (int i = 0; (i < needles_nd) && same_shape; ++i) { + const auto needles_sh_i = needles_shape_ptr[i]; + const auto dst_sh_i = dst_shape_ptr[i]; + + same_shape = same_shape && (needles_sh_i == dst_sh_i); + needles_nelems *= static_cast(needles_sh_i); + } + + if (!same_shape) { + throw py::value_error( + "Array of values to search for and array of their " + "dst do not have the same shape"); + } + + // check that dst is ample enough + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + needles_nelems); + + // check that dst is writable + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // if output array overlaps with input arrays, race condition results + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(dst, hay) || overlap(dst, needles)) { + throw py::value_error("Destination array overlaps with input."); + } + + const int hay_typenum = hay.get_typenum(); + const int needles_typenum = needles.get_typenum(); + const int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum); + const int needles_typeid = + array_types.typenum_to_lookup_id(needles_typenum); + const int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // check hay and needle have the same data-type + if (needles_typeid != hay_typeid) { + throw py::value_error( + "Hay array and needles array must have the same data types"); + } + // check that dst has boolean data type + const auto dst_typenum_t_v = static_cast(dst_typeid); + if (dst_typenum_t_v != td_ns::typenum_t::BOOL) { + throw py::value_error("dst array must have data-type bool"); + } + + if (needles_nelems == 0) { + // Nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + // if all inputs are contiguous call contiguous implementations + // otherwise call strided implementation + const bool hay_is_c_contig = hay.is_c_contiguous(); + const bool hay_is_f_contig = hay.is_f_contiguous(); + + const bool needles_is_c_contig = needles.is_c_contiguous(); + const bool needles_is_f_contig = needles.is_f_contiguous(); + + const bool dst_is_c_contig = dst.is_c_contiguous(); + const bool dst_is_f_contig = dst.is_f_contiguous(); + + const bool all_c_contig = + (hay_is_c_contig && needles_is_c_contig && dst_is_c_contig); + const bool all_f_contig = + (hay_is_f_contig && needles_is_f_contig && dst_is_f_contig); + + const char *hay_data = hay.get_data(); + const char *needles_data = 
needles.get_data(); + + char *dst_data = dst.get_data(); + + if (all_c_contig || all_f_contig) { + auto fn = detail::isin_contig_impl_dispatch_vector[hay_typeid]; + + static constexpr py::ssize_t zero_offset(0); + + sycl::event comp_ev = fn(exec_q, invert, hay_nelems, needles_nelems, + hay_data, zero_offset, needles_data, + zero_offset, dst_data, zero_offset, depends); + + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {hay, needles, dst}, {comp_ev}), + comp_ev); + } + + // strided case + + const auto &needles_strides = needles.get_strides_vector(); + const auto &dst_strides = dst.get_strides_vector(); + + int simplified_nd = needles_nd; + + using shT = std::vector; + shT simplified_common_shape; + shT simplified_needles_strides; + shT simplified_dst_strides; + py::ssize_t needles_offset(0); + py::ssize_t dst_offset(0); + + if (simplified_nd == 0) { + // needles and dst have same nd + simplified_nd = 1; + simplified_common_shape.push_back(1); + simplified_needles_strides.push_back(0); + simplified_dst_strides.push_back(0); + } + else { + simplify_iteration_space( + // modified by reference + simplified_nd, + // read-only inputs + needles_shape_ptr, needles_strides, dst_strides, + // output, modified by reference + simplified_common_shape, simplified_needles_strides, + simplified_dst_strides, needles_offset, dst_offset); + } + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // vectors being packed + simplified_common_shape, simplified_needles_strides, + simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_strides_ev = + std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + auto strided_fn = detail::isin_strided_impl_dispatch_vector[hay_typeid]; + + if (!strided_fn) { + throw std::runtime_error( + "No implementation for data types of input arrays"); + } + + static constexpr py::ssize_t zero_offset(0); + py::ssize_t hay_step = hay.get_strides_vector()[0]; + + const sycl::event &comp_ev = strided_fn( + exec_q, invert, hay_nelems, needles_nelems, hay_data, zero_offset, + hay_step, needles_data, needles_offset, dst_data, dst_offset, + simplified_nd, packed_shape_strides, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, packed_shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + const sycl::event &ht_ev = dpctl::utils::keep_args_alive( + exec_q, {hay, needles, dst}, host_task_events); + + return std::make_pair(ht_ev, comp_ev); +} + +void init_isin_functions(py::module_ m) +{ + detail::init_isin_dispatch_vector(); + + m.def("_isin", &py_isin, py::arg("needles"), py::arg("hay"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("invert"), + py::arg("depends") = py::list()); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/isin.hpp b/dpnp/tensor/libtensor/source/sorting/isin.hpp new file mode 100644 index 000000000000..236e8b5898c6 --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/isin.hpp @@ -0,0 +1,47 @@ 
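py_isin above requires hay to be 1-d and, per the kernel contract, sorted; each needle then reduces to a membership test against hay. A host-side model of the per-element predicate, assuming the device kernels perform the equivalent of a binary search over the sorted hay:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<std::uint8_t> isin(const std::vector<int> &needles,
                                   const std::vector<int> &sorted_hay,
                                   bool invert)
    {
        // one bool-like flag per needle, mirroring the bool-typed dst
        std::vector<std::uint8_t> dst(needles.size());
        for (std::size_t i = 0; i < needles.size(); ++i) {
            const bool found = std::binary_search(
                sorted_hay.begin(), sorted_hay.end(), needles[i]);
            dst[i] = static_cast<std::uint8_t>(invert ? !found : found);
        }
        return dst;
    }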
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_isin_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
new file mode 100644
index 000000000000..11df5cd2ef47
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
@@ -0,0 +1,155 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
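The IsinContigFactory/IsinStridedFactory structs above, and the argsort factories in the file that follows, share one dispatch pattern: a factory template is instantiated once per supported dtype and fills a function-pointer table indexed by a runtime type id; returning nullptr marks a combination as unsupported (which py_isin guards with an explicit check). A stripped-down model, with all names illustrative rather than dpctl's:

    #include <array>
    #include <cstddef>

    using unary_fn_t = void (*)(const void *, void *, std::size_t);

    template <typename T>
    void negate_impl(const void *src_p, void *dst_p, std::size_t n)
    {
        const T *src = static_cast<const T *>(src_p);
        T *dst = static_cast<T *>(dst_p);
        for (std::size_t i = 0; i < n; ++i) {
            dst[i] = -src[i];
        }
    }

    // one factory instantiation per supported dtype; a factory may also
    // return nullptr to mark the dtype unsupported
    template <typename fnT, typename T>
    struct NegateFactory
    {
        fnT get() const { return negate_impl<T>; }
    };

    enum TypeId : std::size_t { FLOAT_ID = 0, DOUBLE_ID = 1, NUM_TYPE_IDS = 2 };

    static const std::array<unary_fn_t, NUM_TYPE_IDS> dispatch_vector{
        NegateFactory<unary_fn_t, float>{}.get(),
        NegateFactory<unary_fn_t, double>{}.get(),
    };
    // at call time: auto fn = dispatch_vector[type_id]; fn(src, dst, n);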
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "merge_argsort.hpp" +#include "py_argsort_common.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct AscendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) { + using dpctl::tensor::rich_comparisons::AscendingSorter; + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) { + using dpctl::tensor::rich_comparisons::DescendingSorter; + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +void init_merge_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder< + sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types> + dtb2; + dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table); +} + +void init_merge_argsort_functions(py::module_ m) +{ + init_merge_argsort_dispatch_tables(); + + auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_argsort_contig_dispatch_table); + }; + m.def("_argsort_ascending", 
py_argsort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_argsort_contig_dispatch_table); + }; + m.def("_argsort_descending", py_argsort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/merge_argsort.hpp b/dpnp/tensor/libtensor/source/sorting/merge_argsort.hpp new file mode 100644 index 000000000000..10777b4bc2fd --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/merge_argsort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
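_argsort_ascending and _argsort_descending, bound above, write permutation indices rather than sorted values. Per contiguous segment the operation amounts to a stable sort of an index vector ordered by the values it points at; a conceptual model of the semantics (not the SYCL merge-sort implementation):

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // stable argsort of one contiguous segment: indices 0..n-1 are
    // reordered so that values[idx[0]] <= values[idx[1]] <= ... and
    // ties keep their original order
    std::vector<std::int64_t> stable_argsort(const std::vector<double> &values)
    {
        std::vector<std::int64_t> idx(values.size());
        std::iota(idx.begin(), idx.end(), std::int64_t{0});
        std::stable_sort(idx.begin(), idx.end(),
                         [&values](std::int64_t i, std::int64_t j) {
                             return values[i] < values[j];
                         });
        return idx;
    }
    // descending order is the same routine with the comparison reversed,
    // which is exactly what the two dispatch tables above select between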
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_merge_argsort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/merge_sort.cpp b/dpnp/tensor/libtensor/source/sorting/merge_sort.cpp
new file mode 100644
index 000000000000..fbd60621b3bb
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/merge_sort.cpp
@@ -0,0 +1,139 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
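merge_sort.cpp, which follows, pairs the same stable merge-sort entry point with an ascending and a descending comparator type. Operationally, an (iter_nelems x sort_nelems) C-contiguous buffer is sorted row by row along the trailing axis; a minimal host-side model:

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <vector>

    // model of the axis-1 contiguous sort entry points: each contiguous
    // row of length sort_nelems is sorted independently with the chosen
    // comparator
    template <typename T, typename Comp>
    void sort_axis1_contig(std::vector<T> &data, std::size_t sort_nelems,
                           Comp comp)
    {
        if (sort_nelems == 0) {
            return;
        }
        for (std::size_t off = 0; off + sort_nelems <= data.size();
             off += sort_nelems)
        {
            auto first = data.begin() + static_cast<std::ptrdiff_t>(off);
            auto last = first + static_cast<std::ptrdiff_t>(sort_nelems);
            std::stable_sort(first, last, comp);
        }
    }

    // _sort_ascending corresponds to std::less and _sort_descending to
    // std::greater:
    //   sort_axis1_contig(buf, row_len, std::less<double>{});
    //   sort_axis1_contig(buf, row_len, std::greater<double>{});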
+//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "merge_sort.hpp" +#include "py_sort_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_sort_contig_dispatch_vector[td_ns::num_types]; + +template +struct AscendingSortContigFactory +{ + fnT get() + { + using dpctl::tensor::rich_comparisons::AscendingSorter; + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +template +struct DescendingSortContigFactory +{ + fnT get() + { + using dpctl::tensor::rich_comparisons::DescendingSorter; + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +void init_merge_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder + dtv1; + dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector); +} + +void init_merge_sort_functions(py::module_ m) +{ + init_merge_sort_dispatch_vectors(); + + auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_sort_contig_dispatch_vector); + }; + m.def("_sort_ascending", py_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_sort_contig_dispatch_vector); + }; + m.def("_sort_descending", py_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/merge_sort.hpp b/dpnp/tensor/libtensor/source/sorting/merge_sort.hpp new file mode 100644 index 000000000000..a6bdd0a4efe9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/merge_sort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
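A note on the binding style used by init_merge_sort_functions and its siblings: each Python-visible function is a lambda adapter registered with named arguments and a depends list that defaults to an empty py::list. A self-contained toy module showing just that mechanism; the module and function names here are hypothetical:

    #include <pybind11/pybind11.h>
    #include <pybind11/stl.h>

    #include <vector>

    namespace py = pybind11;

    int sum_impl(const std::vector<int> &values,
                 const std::vector<int> &depends)
    {
        // `depends` would carry sycl::event objects in the real bindings
        (void)depends;
        int s = 0;
        for (int v : values) {
            s += v;
        }
        return s;
    }

    PYBIND11_MODULE(_toy_impl, m)
    {
        // lambda adapter, as in the init_*_functions above
        auto py_sum = [](const std::vector<int> &values,
                         const std::vector<int> &depends) {
            return sum_impl(values, depends);
        };
        m.def("_sum", py_sum, py::arg("values"),
              py::arg("depends") = py::list());
    }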
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_merge_sort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp new file mode 100644 index 000000000000..018f3166a0ad --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp @@ -0,0 +1,183 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename sorting_contig_impl_fnT>
+std::pair<sycl::event, sycl::event>
+    py_argsort(const dpctl::tensor::usm_ndarray &src,
+               const int trailing_dims_to_sort,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends,
+               const sorting_contig_impl_fnT &sort_contig_fns)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iteration_nd = src_nd - trailing_dims_to_sort;
+    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
+                              "greater than rank of the array being sorted");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+
+    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t sort_nelems(1);
+    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        sort_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (sort_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, sort_nelems * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if ((dst_typeid != static_cast<int>(td_ns::typenum_t::INT64)) &&
+        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32)))
+    {
+        throw py::value_error(
+            "Output index array must have data type int32 or int64");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        if (sort_nelems > 1) {
+            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
+
+            auto fn = sort_contig_fns[src_typeid][dst_typeid];
+
+            if (fn == nullptr) {
+                throw py::value_error(
+                    "Not implemented for dtypes of input arrays");
+            }
+
+            sycl::event comp_ev =
+                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
+                   dst.get_data(), zero_offset, zero_offset, zero_offset,
+                   zero_offset, depends);
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
+
+            return std::make_pair(keep_args_alive_ev, comp_ev);
+        }
+        else {
+            assert(dst.get_size() == iter_nelems);
+            int dst_elemsize = dst.get_elemsize();
+            static constexpr int memset_val(0);
+
+            sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                cgh.memset(reinterpret_cast<void *>(dst.get_data()),
+                           memset_val, iter_nelems * dst_elemsize);
+            });
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {fill_ev});
+
+            return std::make_pair(keep_args_alive_ev, fill_ev);
+        }
+    }
+
+    throw py::value_error(
+        "Both source and destination arrays must be C-contiguous");
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpnp/tensor/libtensor/source/sorting/py_sort_common.hpp
new file mode 100644
index 000000000000..ee8777f35077
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/py_sort_common.hpp
@@ -0,0 +1,178 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename sorting_contig_impl_fnT>
+std::pair<sycl::event, sycl::event>
+    py_sort(const dpctl::tensor::usm_ndarray &src,
+            const int trailing_dims_to_sort,
+            const dpctl::tensor::usm_ndarray &dst,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends,
+            const sorting_contig_impl_fnT &sort_contig_fns)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iteration_nd = src_nd - trailing_dims_to_sort;
+    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
+                              "greater than rank of the array being sorted");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+
+    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t sort_nelems(1);
+    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        sort_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (sort_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, sort_nelems * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error("Both input arrays must have "
+                              "the same value data type");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        if (sort_nelems > 1) {
+            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
+
+            auto fn = sort_contig_fns[src_typeid];
+
+            if (nullptr == fn) {
+                throw py::value_error(
+                    "Not implemented for the dtype of input arrays");
+            }
+
+            sycl::event comp_ev =
+                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
+                   dst.get_data(), zero_offset, zero_offset, zero_offset,
+                   zero_offset, depends);
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
+
+            return std::make_pair(keep_args_alive_ev, comp_ev);
+        }
+        else {
+            assert(dst.get_size() == iter_nelems);
+            int src_elemsize = src.get_elemsize();
+
+            sycl::event copy_ev =
+                exec_q.copy<char>(src.get_data(), dst.get_data(),
+                                  src_elemsize * iter_nelems, depends);
+
+            return std::make_pair(
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                copy_ev);
+        }
+    }
+
+    throw py::value_error(
+        "Both source and destination arrays must be C-contiguous");
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
new file mode 100644
index 000000000000..0eec8fba9ded
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
@@ -0,0 +1,185 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "py_argsort_common.hpp"
+#include "radix_argsort.hpp"
+#include "radix_sort_support.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+static sort_contig_fn_ptr_t
+    ascending_radix_argsort_contig_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_radix_argsort_contig_dispatch_table[td_ns::num_types]
+                                                  [td_ns::num_types];
+
+namespace
+{
+
+template <bool is_ascending, typename argTy, typename IndexTy>
+sycl::event argsort_axis1_contig_caller(sycl::queue &q,
+                                        std::size_t iter_nelems,
+                                        std::size_t sort_nelems,
+                                        const char *arg_cp,
+                                        char *res_cp,
+                                        ssize_t iter_arg_offset,
+                                        ssize_t iter_res_offset,
+                                        ssize_t sort_arg_offset,
+                                        ssize_t sort_res_offset,
+                                        const std::vector<sycl::event> &depends)
+{
+    using dpctl::tensor::kernels::radix_argsort_axis1_contig_impl;
+
+    return radix_argsort_axis1_contig_impl<argTy, IndexTy>(
+        q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
+        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
+        depends);
+}
+
+} // end of anonymous namespace
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct AscendingRadixArgSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined &&
+                      (std::is_same_v<IndexTy, std::int64_t> ||
+                       std::is_same_v<IndexTy, std::int32_t>))
+        {
+            return argsort_axis1_contig_caller<
+                /*ascending*/ true, argTy, IndexTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct DescendingRadixArgSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined &&
+                      (std::is_same_v<IndexTy, std::int64_t> ||
+                       std::is_same_v<IndexTy, std::int32_t>))
+        {
+            return argsort_axis1_contig_caller<
+                /*ascending*/ false, argTy, IndexTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_radix_argsort_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
+                                AscendingRadixArgSortContigFactory,
+                                td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(ascending_radix_argsort_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
+                                DescendingRadixArgSortContigFactory,
+                                td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(
+        descending_radix_argsort_contig_dispatch_table);
+}
+
+void init_radix_argsort_functions(py::module_ m)
+{
+    init_radix_argsort_dispatch_tables();
+
+    auto py_radix_argsort_ascending =
+        [](const dpctl::tensor::usm_ndarray &src,
+           const int trailing_dims_to_sort,
+           const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
+           const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                          ascending_radix_argsort_contig_dispatch_table);
+    };
+    m.def("_radix_argsort_ascending", py_radix_argsort_ascending,
+          py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto py_radix_argsort_descending =
+        [](const dpctl::tensor::usm_ndarray &src,
+           const int trailing_dims_to_sort,
+           const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
+           const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                          descending_radix_argsort_contig_dispatch_table);
+    };
+    m.def("_radix_argsort_descending", py_radix_argsort_descending,
+          py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_argsort.hpp b/dpnp/tensor/libtensor/source/sorting/radix_argsort.hpp
new file mode 100644
index 000000000000..89013fbb1bdc
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_argsort.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_radix_argsort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_sort.cpp b/dpnp/tensor/libtensor/source/sorting/radix_sort.cpp
new file mode 100644
index 000000000000..35c71a0eb7d3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_sort.cpp
@@ -0,0 +1,188 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <exception>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "py_sort_common.hpp"
+#include "radix_sort.hpp"
+#include "radix_sort_support.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+static sort_contig_fn_ptr_t
+    ascending_radix_sort_contig_dispatch_vector[td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_radix_sort_contig_dispatch_vector[td_ns::num_types];
+
+namespace
+{
+
+template <bool is_ascending, typename argTy>
+sycl::event sort_axis1_contig_caller(sycl::queue &q,
+                                     std::size_t iter_nelems,
+                                     std::size_t sort_nelems,
+                                     const char *arg_cp,
+                                     char *res_cp,
+                                     ssize_t iter_arg_offset,
+                                     ssize_t iter_res_offset,
+                                     ssize_t sort_arg_offset,
+                                     ssize_t sort_res_offset,
+                                     const std::vector<sycl::event> &depends)
+{
+    using dpctl::tensor::kernels::radix_sort_axis1_contig_impl;
+
+    return radix_sort_axis1_contig_impl<argTy>(
+        q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
+        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
+        depends);
+}
+
+} // end of anonymous namespace
+
+template <typename fnT, typename argTy>
+struct AscendingRadixSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined) {
+            return sort_axis1_contig_caller</*ascending*/ true, argTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy>
+struct DescendingRadixSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined) {
+            return sort_axis1_contig_caller</*ascending*/ false, argTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_radix_sort_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    td_ns::DispatchVectorBuilder<
+        sort_contig_fn_ptr_t, AscendingRadixSortContigFactory, td_ns::num_types>
+        dtv1;
+    dtv1.populate_dispatch_vector(ascending_radix_sort_contig_dispatch_vector);
+
+    td_ns::DispatchVectorBuilder<sort_contig_fn_ptr_t,
+                                 DescendingRadixSortContigFactory,
+                                 td_ns::num_types>
+        dtv2;
+    dtv2.populate_dispatch_vector(descending_radix_sort_contig_dispatch_vector);
+}
+
+bool py_radix_sort_defined(int typenum)
+{
+    const auto &array_types = td_ns::usm_ndarray_types();
+
+    try {
+        int type_id = array_types.typenum_to_lookup_id(typenum);
+        return (nullptr !=
+                ascending_radix_sort_contig_dispatch_vector[type_id]);
+    } catch (const std::exception &e) {
+        return false;
+    }
+}
+
+void init_radix_sort_functions(py::module_ m)
+{
+    init_radix_sort_dispatch_vectors();
+
+    auto py_radix_sort_ascending = [](const dpctl::tensor::usm_ndarray &src,
+                                      const int trailing_dims_to_sort,
+                                      const dpctl::tensor::usm_ndarray &dst,
+                                      sycl::queue &exec_q,
+                                      const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                       ascending_radix_sort_contig_dispatch_vector);
+    };
+    m.def("_radix_sort_ascending", py_radix_sort_ascending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto py_radix_sort_descending = [](const dpctl::tensor::usm_ndarray &src,
+                                       const int trailing_dims_to_sort,
+                                       const dpctl::tensor::usm_ndarray &dst,
+                                       sycl::queue &exec_q,
+                                       const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                       descending_radix_sort_contig_dispatch_vector);
+    };
+    m.def("_radix_sort_descending", py_radix_sort_descending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_radix_sort_dtype_supported", py_radix_sort_defined);
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_sort.hpp b/dpnp/tensor/libtensor/source/sorting/radix_sort.hpp
new file mode 100644
index 000000000000..5f3c771b464b
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_sort.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_radix_sort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_sort_support.hpp b/dpnp/tensor/libtensor/source/sorting/radix_sort_support.hpp
new file mode 100644
index 000000000000..8d7e55a5cd28
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_sort_support.hpp
@@ -0,0 +1,78 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
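The support header that follows resolves, at compile time, whether the radix-sort kernels are defined for a given element type by walking a std::disjunction of per-type entries terminated by an always-true fallback. A small self-contained sketch of the idiom (type names illustrative, list shortened to two supported types):

    #include <cstdint>
    #include <type_traits>

    template <typename T, typename ArgTy>
    struct DefinedEntry : std::bool_constant<std::is_same_v<T, ArgTy>>
    {
        static constexpr bool is_defined = true;
    };

    struct FallbackEntry : std::true_type
    {
        static constexpr bool is_defined = false;
    };

    // std::disjunction aliases the first entry whose ::value is true; the
    // always-true fallback terminates the search for unsupported types.
    template <typename T> struct SupportVector
    {
        using resolver_t = std::disjunction<DefinedEntry<T, std::int32_t>,
                                            DefinedEntry<T, float>,
                                            FallbackEntry>;
        static constexpr bool is_defined = resolver_t::is_defined;
    };

    static_assert(SupportVector<float>::is_defined, "float is supported");
    static_assert(!SupportVector<char>::is_defined, "char falls through");

The trick is that the matched entry carries an extra is_defined flag alongside the ::value that std::disjunction inspects, so one trait answers "is this type in the supported list?" without a hand-written specialization per type.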
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename T, typename ArgTy>
+struct TypeDefinedEntry : std::bool_constant<std::is_same_v<T, ArgTy>>
+{
+    static constexpr bool is_defined = true;
+};
+
+struct NotDefinedEntry : std::true_type
+{
+    static constexpr bool is_defined = false;
+};
+
+template <typename argTy>
+struct RadixSortSupportVector
+{
+    using resolver_t =
+        typename std::disjunction<TypeDefinedEntry<argTy, bool>,
+                                  TypeDefinedEntry<argTy, std::int8_t>,
+                                  TypeDefinedEntry<argTy, std::uint8_t>,
+                                  TypeDefinedEntry<argTy, std::int16_t>,
+                                  TypeDefinedEntry<argTy, std::uint16_t>,
+                                  TypeDefinedEntry<argTy, std::int32_t>,
+                                  TypeDefinedEntry<argTy, std::uint32_t>,
+                                  TypeDefinedEntry<argTy, std::int64_t>,
+                                  TypeDefinedEntry<argTy, std::uint64_t>,
+                                  TypeDefinedEntry<argTy, sycl::half>,
+                                  TypeDefinedEntry<argTy, float>,
+                                  TypeDefinedEntry<argTy, double>,
+                                  NotDefinedEntry>;
+
+    static constexpr bool is_defined = resolver_t::is_defined;
+};
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
new file mode 100644
index 000000000000..6c50b0cbc08c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
@@ -0,0 +1,473 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
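py_searchsorted below validates shapes, queues, and dtypes, then dispatches to a contiguous or strided kernel; the left/right distinction is the familiar lower/upper bound split. A host-side sketch of the invariant the device kernels implement, using the standard library (illustrative only; the kernels do not call these):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
        const std::vector<int> hay = {1, 2, 2, 3};

        // side='left': first position where needle could be inserted
        // while keeping hay sorted, before any equal elements.
        auto left = std::lower_bound(hay.begin(), hay.end(), 2) - hay.begin();

        // side='right': insertion position after any equal elements.
        auto right = std::upper_bound(hay.begin(), hay.end(), 2) - hay.begin();

        assert(left == 1 && right == 3);
        return 0;
    }

Because hay is required to be one-dimensional while needles/positions may be strided n-d arrays, only the needles/positions iteration space is simplified and packed for the strided path; hay contributes a single stride.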
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/sorting/searchsorted.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/rich_comparisons.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "simplify_iteration_space.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace detail
+{
+
+using dpctl::tensor::kernels::searchsorted_contig_impl_fp_ptr_t;
+
+static searchsorted_contig_impl_fp_ptr_t
+    left_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types];
+
+static searchsorted_contig_impl_fp_ptr_t
+    right_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types];
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct LeftSideSearchSortedContigFactory
+{
+    constexpr LeftSideSearchSortedContigFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int32_t> ||
+                      std::is_same_v<IndexTy, std::int64_t>)
+        {
+            static constexpr bool left_side_search(true);
+            using dpctl::tensor::kernels::searchsorted_contig_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_contig_impl<argTy, IndexTy, Compare,
+                                            left_side_search>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct RightSideSearchSortedContigFactory
+{
+    constexpr RightSideSearchSortedContigFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int32_t> ||
+                      std::is_same_v<IndexTy, std::int64_t>)
+        {
+            static constexpr bool right_side_search(false);
+
+            using dpctl::tensor::kernels::searchsorted_contig_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_contig_impl<argTy, IndexTy, Compare,
+                                            right_side_search>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+using dpctl::tensor::kernels::searchsorted_strided_impl_fp_ptr_t;
+
+static searchsorted_strided_impl_fp_ptr_t
+    left_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types];
+
+static searchsorted_strided_impl_fp_ptr_t
+    right_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types];
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct LeftSideSearchSortedStridedFactory
+{
+    constexpr LeftSideSearchSortedStridedFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int32_t> ||
+                      std::is_same_v<IndexTy, std::int64_t>)
+        {
+            static constexpr bool left_side_search(true);
+            using dpctl::tensor::kernels::searchsorted_strided_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_strided_impl<argTy, IndexTy, Compare,
+                                             left_side_search>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct RightSideSearchSortedStridedFactory
+{
+    constexpr RightSideSearchSortedStridedFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int32_t> ||
+                      std::is_same_v<IndexTy, std::int64_t>)
+        {
+            static constexpr bool right_side_search(false);
+            using dpctl::tensor::kernels::searchsorted_strided_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_strided_impl<argTy, IndexTy, Compare,
+                                             right_side_search>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_searchsorted_dispatch_table(void)
+{
+
+    // Contiguous input function dispatch
+    td_ns::DispatchTableBuilder<searchsorted_contig_impl_fp_ptr_t,
+                                LeftSideSearchSortedContigFactory,
+                                td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(left_side_searchsorted_contig_impl);
+
+    td_ns::DispatchTableBuilder<searchsorted_contig_impl_fp_ptr_t,
+                                RightSideSearchSortedContigFactory,
+                                td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(right_side_searchsorted_contig_impl);
+
+    // Strided input function dispatch
+    td_ns::DispatchTableBuilder<searchsorted_strided_impl_fp_ptr_t,
+                                LeftSideSearchSortedStridedFactory,
+                                td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(left_side_searchsorted_strided_impl);
+
+    td_ns::DispatchTableBuilder<searchsorted_strided_impl_fp_ptr_t,
+                                RightSideSearchSortedStridedFactory,
+                                td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(right_side_searchsorted_strided_impl);
+}
+
+} // namespace detail
+
+/*! @brief search for needle from needles in sorted hay */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted(const dpctl::tensor::usm_ndarray &hay,
+                    const dpctl::tensor::usm_ndarray &needles,
+                    const dpctl::tensor::usm_ndarray &positions,
+                    sycl::queue &exec_q,
+                    const bool search_left_side,
+                    const std::vector<sycl::event> &depends)
+{
+    const int hay_nd = hay.get_ndim();
+    const int needles_nd = needles.get_ndim();
+    const int positions_nd = positions.get_ndim();
+
+    if (hay_nd != 1 || needles_nd != positions_nd) {
+        throw py::value_error("Array dimensions mismatch");
+    }
+
+    // check that needles and positions have the same shape
+    std::size_t needles_nelems(1);
+    bool same_shape(true);
+
+    const std::size_t hay_nelems = static_cast<std::size_t>(hay.get_shape(0));
+
+    const py::ssize_t *needles_shape_ptr = needles.get_shape_raw();
+    const py::ssize_t *positions_shape_ptr = positions.get_shape_raw();
+
+    for (int i = 0; (i < needles_nd) && same_shape; ++i) {
+        const auto needles_sh_i = needles_shape_ptr[i];
+        const auto positions_sh_i = positions_shape_ptr[i];
+
+        same_shape = same_shape && (needles_sh_i == positions_sh_i);
+        needles_nelems *= static_cast<std::size_t>(needles_sh_i);
+    }
+
+    if (!same_shape) {
+        throw py::value_error(
+            "Array of values to search for and array of their "
+            "positions do not have the same shape");
+    }
+
+    // check that positions is ample enough
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(positions,
+                                                               needles_nelems);
+
+    // check that positions is writable
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(positions);
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {hay, needles, positions}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // if output array overlaps with input arrays, race condition results
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(positions, hay) || overlap(positions, needles)) {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    const int hay_typenum = hay.get_typenum();
+    const int needles_typenum = needles.get_typenum();
+    const int positions_typenum = positions.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum);
+    const int needles_typeid =
+        array_types.typenum_to_lookup_id(needles_typenum);
+    const int positions_typeid =
+        array_types.typenum_to_lookup_id(positions_typenum);
+
+    // check hay and needles have the same data-type
+    if (needles_typeid != hay_typeid) {
+        throw py::value_error(
+            "Hay array and needles array must have the same data types");
+    }
+    // check that positions has indexing data-type (int32, or int64)
+    const auto positions_typenum_t_v =
+        static_cast<td_ns::typenum_t>(positions_typeid);
+    if (positions_typenum_t_v != td_ns::typenum_t::INT32 &&
+        positions_typenum_t_v != td_ns::typenum_t::INT64)
+    {
+        throw py::value_error(
+            "Positions array must have data-type int32, or int64");
+    }
+
+    if (needles_nelems == 0) {
+        // Nothing to do
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    // if all inputs are contiguous call contiguous implementations
+    // otherwise call strided implementation
+    const bool hay_is_c_contig = hay.is_c_contiguous();
+    const bool hay_is_f_contig = hay.is_f_contiguous();
+
+    const bool needles_is_c_contig = needles.is_c_contiguous();
+    const bool needles_is_f_contig = needles.is_f_contiguous();
+
+    const bool positions_is_c_contig = positions.is_c_contiguous();
+    const bool positions_is_f_contig = positions.is_f_contiguous();
+
+    const bool all_c_contig =
+        (hay_is_c_contig && needles_is_c_contig && positions_is_c_contig);
+    const bool all_f_contig =
+        (hay_is_f_contig && needles_is_f_contig && positions_is_f_contig);
+
+    const char *hay_data = hay.get_data();
+    const char *needles_data = needles.get_data();
+
+    char *positions_data = positions.get_data();
+
+    if (all_c_contig || all_f_contig) {
+        auto fn =
+            (search_left_side)
+                ? detail::left_side_searchsorted_contig_impl[hay_typeid]
+                                                            [positions_typeid]
+                : detail::right_side_searchsorted_contig_impl[hay_typeid]
+                                                             [positions_typeid];
+
+        if (fn) {
+            static constexpr py::ssize_t zero_offset(0);
+
+            sycl::event comp_ev =
+                fn(exec_q, hay_nelems, needles_nelems, hay_data, zero_offset,
+                   needles_data, zero_offset, positions_data, zero_offset,
+                   depends);
+
+            return std::make_pair(
+                dpctl::utils::keep_args_alive(exec_q, {hay, needles, positions},
+                                              {comp_ev}),
+                comp_ev);
+        }
+    }
+
+    // strided case
+
+    const auto &needles_strides = needles.get_strides_vector();
+    const auto &positions_strides = positions.get_strides_vector();
+
+    int simplified_nd = needles_nd;
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_common_shape;
+    shT simplified_needles_strides;
+    shT simplified_positions_strides;
+    py::ssize_t needles_offset(0);
+    py::ssize_t positions_offset(0);
+
+    if (simplified_nd == 0) {
+        // needles and positions have same nd
+        simplified_nd = 1;
+        simplified_common_shape.push_back(1);
+        simplified_needles_strides.push_back(0);
+        simplified_positions_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            // modified by reference
+            simplified_nd,
+            // read-only inputs
+            needles_shape_ptr, needles_strides, positions_strides,
+            // output, modified by reference
+            simplified_common_shape, simplified_needles_strides,
+            simplified_positions_strides, needles_offset, positions_offset);
+    }
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // vectors being packed
+        simplified_common_shape, simplified_needles_strides,
+        simplified_positions_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_strides_ev =
+        std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    auto strided_fn =
+        (search_left_side)
+            ? detail::left_side_searchsorted_strided_impl[hay_typeid]
+                                                         [positions_typeid]
+            : detail::right_side_searchsorted_strided_impl[hay_typeid]
+                                                          [positions_typeid];
+
+    if (!strided_fn) {
+        throw std::runtime_error(
+            "No implementation for data types of input arrays");
+    }
+
+    static constexpr py::ssize_t zero_offset(0);
+    py::ssize_t hay_step = hay.get_strides_vector()[0];
+
+    const sycl::event &comp_ev = strided_fn(
+        exec_q, hay_nelems, needles_nelems, hay_data, zero_offset, hay_step,
+        needles_data, needles_offset, positions_data, positions_offset,
+        simplified_nd, packed_shape_strides, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {comp_ev}, packed_shape_strides_owner);
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+    const sycl::event &ht_ev = dpctl::utils::keep_args_alive(
+        exec_q, {hay, needles, positions}, host_task_events);
+
+    return std::make_pair(ht_ev, comp_ev);
+}
+
+/*! @brief search for needle from needles in sorted hay,
+ *         hay[pos] <= needle < hay[pos + 1]
+ */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted_left(const dpctl::tensor::usm_ndarray &hay,
+                         const dpctl::tensor::usm_ndarray &needles,
+                         const dpctl::tensor::usm_ndarray &positions,
+                         sycl::queue &exec_q,
+                         const std::vector<sycl::event> &depends)
+{
+    static constexpr bool side_left(true);
+    return py_searchsorted(hay, needles, positions, exec_q, side_left, depends);
+}
+
+/*! @brief search for needle from needles in sorted hay,
+ *         hay[pos] < needle <= hay[pos + 1]
+ */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted_right(const dpctl::tensor::usm_ndarray &hay,
+                          const dpctl::tensor::usm_ndarray &needles,
+                          const dpctl::tensor::usm_ndarray &positions,
+                          sycl::queue &exec_q,
+                          const std::vector<sycl::event> &depends)
+{
+    static constexpr bool side_right(false);
+    return py_searchsorted(hay, needles, positions, exec_q, side_right,
+                           depends);
+}
+
+void init_searchsorted_functions(py::module_ m)
+{
+    detail::init_searchsorted_dispatch_table();
+
+    m.def("_searchsorted_left", &py_searchsorted_left, py::arg("hay"),
+          py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+    m.def("_searchsorted_right", &py_searchsorted_right, py::arg("hay"),
+          py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/searchsorted.hpp b/dpnp/tensor/libtensor/source/sorting/searchsorted.hpp
new file mode 100644
index 000000000000..b60dae1e0ec9
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/searchsorted.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_searchsorted_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/topk.cpp b/dpnp/tensor/libtensor/source/sorting/topk.cpp
new file mode 100644
index 000000000000..6b8344df12c8
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/topk.cpp
@@ -0,0 +1,303 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
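py_topk below picks a radix-based kernel for small integer types (see use_radix_sort) and a comparator-based merge path for everything else; largest-first selection is just the descending comparator, or ascending order for the radix path. The host-side analogue of the largest-k selection is a partial sort; a minimal sketch under that analogy (not the device implementation, which also emits int64 indices):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <functional>
    #include <vector>

    int main()
    {
        std::vector<float> v = {0.5f, 3.0f, -1.0f, 7.0f, 2.0f};
        const std::size_t k = 2;

        // largest=true: place the k largest values first, in descending
        // order; only the first k positions are guaranteed to be ordered.
        std::partial_sort(v.begin(), v.begin() + k, v.end(), std::greater<>{});

        for (std::size_t i = 0; i < k; ++i)
            std::printf("%g\n", v[i]); // prints 7 then 3
        return 0;
    }

Selecting k of n elements this way costs O(n log k) comparisons, which is why a dedicated top-k kernel beats sorting the whole axis when k is small relative to axis_nelems.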
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/sorting/topk.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/rich_comparisons.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "topk.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &,
+                                          std::size_t,
+                                          std::size_t,
+                                          std::size_t,
+                                          bool,
+                                          const char *,
+                                          char *,
+                                          char *,
+                                          const std::vector<sycl::event> &);
+
+static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types];
+
+namespace
+{
+
+template <typename T, typename = void>
+struct use_radix_sort : public std::false_type
+{
+};
+
+template <typename T>
+struct use_radix_sort<
+    T,
+    std::enable_if_t<std::disjunction<std::is_same<T, bool>,
+                                      std::is_same<T, std::uint8_t>,
+                                      std::is_same<T, std::int8_t>,
+                                      std::is_same<T, std::uint16_t>,
+                                      std::is_same<T, std::int16_t>>::value>>
+    : public std::true_type
+{
+};
+
+template <typename argTy, typename IndexTy>
+sycl::event topk_caller(sycl::queue &exec_q,
+                        std::size_t iter_nelems, // number of sub-arrays
+                        std::size_t axis_nelems, // size of each sub-array
+                        std::size_t k,
+                        bool largest,
+                        const char *arg_cp,
+                        char *vals_cp,
+                        char *inds_cp,
+                        const std::vector<sycl::event> &depends)
+{
+    if constexpr (use_radix_sort<argTy>::value) {
+        using dpctl::tensor::kernels::topk_radix_impl;
+        auto ascending = !largest;
+        return topk_radix_impl<argTy, IndexTy>(exec_q, iter_nelems, axis_nelems,
+                                               k, ascending, arg_cp, vals_cp,
+                                               inds_cp, depends);
+    }
+    else {
+        using dpctl::tensor::kernels::topk_merge_impl;
+        if (largest) {
+            using CompTy =
+                typename dpctl::tensor::rich_comparisons::DescendingSorter<
+                    argTy>::type;
+            return topk_merge_impl<argTy, IndexTy, CompTy>(
+                exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp,
+                depends);
+        }
+        else {
+            using CompTy =
+                typename dpctl::tensor::rich_comparisons::AscendingSorter<
+                    argTy>::type;
+            return topk_merge_impl<argTy, IndexTy, CompTy>(
+                exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp,
+                depends);
+        }
+    }
+}
+
+} // namespace
+
+std::pair<sycl::event, sycl::event>
+    py_topk(const dpctl::tensor::usm_ndarray &src,
+            std::optional<const int> trailing_dims_to_search,
+            const std::size_t k,
+            const bool largest,
+            const dpctl::tensor::usm_ndarray &vals,
+            const dpctl::tensor::usm_ndarray &inds,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int vals_nd = vals.get_ndim();
+    int inds_nd = inds.get_ndim();
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *vals_shape_ptr = vals.get_shape_raw();
+    const py::ssize_t *inds_shape_ptr = inds.get_shape_raw();
+
+    std::size_t axis_nelems(1);
+    std::size_t iter_nelems(1);
+    if (trailing_dims_to_search.has_value()) {
+        if (src_nd != vals_nd || src_nd != inds_nd) {
+            throw py::value_error("The input and output arrays must have "
+                                  "the same array ranks");
+        }
+
+        auto trailing_dims = trailing_dims_to_search.value();
+        int iter_nd = src_nd - trailing_dims;
+        if (trailing_dims <= 0 || iter_nd < 0) {
+            throw py::value_error(
+                "trailing_dims_to_search must be positive, but no "
+                "greater than rank of the array being searched");
+        }
+
+        bool same_shapes = true;
+        for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+            auto src_shape_i = src_shape_ptr[i];
+            same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] &&
+                                          src_shape_i == inds_shape_ptr[i]);
+            iter_nelems *= static_cast<std::size_t>(src_shape_i);
+        }
+
+        if (!same_shapes) {
+            throw py::value_error(
+                "Destination shape does not match the input shape");
+        }
+
+        std::size_t vals_k(1);
+        std::size_t inds_k(1);
+        for (int i = iter_nd; i < src_nd; ++i) {
+            axis_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+            vals_k *= static_cast<std::size_t>(vals_shape_ptr[i]);
+            inds_k *= static_cast<std::size_t>(inds_shape_ptr[i]);
+        }
+
+        bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k);
+        if (!valid_k) {
+            throw py::value_error("The value of k is invalid for the input and "
+                                  "destination arrays");
+        }
+    }
+    else {
+        if (vals_nd != 1 || inds_nd != 1) {
+            throw py::value_error("Output arrays must be one-dimensional");
+        }
+
+        for (int i = 0; i < src_nd; ++i) {
+            axis_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+        }
+
+        bool valid_k = (axis_nelems >= k &&
+                        static_cast<std::size_t>(vals_shape_ptr[0]) == k &&
+                        static_cast<std::size_t>(inds_shape_ptr[0]) == k);
+        if (!valid_k) {
+            throw py::value_error("The value of k is invalid for the input and "
+                                  "destination arrays");
+        }
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals);
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds);
+
+    if ((iter_nelems == 0) || (axis_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, vals) || overlap(src, inds)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals,
+                                                               k * iter_nelems);
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds,
+                                                               k * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int vals_typenum = vals.get_typenum();
+    int inds_typenum = inds.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum);
+    int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum);
+
+    if (src_typeid != vals_typeid) {
+        throw py::value_error("Input array and vals array must have "
+                              "the same data type");
+    }
+
+    if (inds_typeid != static_cast<int>(td_ns::typenum_t::INT64)) {
+        throw py::value_error("Inds array must have data type int64");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_vals_c_contig = vals.is_c_contiguous();
+    bool is_inds_c_contig = inds.is_c_contiguous();
+
+    if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) {
+        auto fn = topk_dispatch_vector[src_typeid];
+
+        sycl::event comp_ev =
+            fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(),
+               vals.get_data(), inds.get_data(), depends);
+
+        sycl::event keep_args_alive_ev =
+            dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev});
+
+        return std::make_pair(keep_args_alive_ev, comp_ev);
+    }
+
+    return std::make_pair(sycl::event(), sycl::event());
+}
+
+template <typename fnT, typename argTy>
+struct TopKFactory
+{
+    fnT get()
+    {
+        using IdxT = std::int64_t;
+        return topk_caller<argTy, IdxT>;
+    }
+};
+
+void init_topk_dispatch_vectors(void)
+{
+    td_ns::DispatchVectorBuilder<topk_impl_fn_ptr_t, TopKFactory,
+                                 td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(topk_dispatch_vector);
+}
+
+void init_topk_functions(py::module_ m)
+{
+    init_topk_dispatch_vectors();
+
+    m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"),
+          py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+}
+
+} // namespace dpctl::tensor::py_internal
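Editor's note between files: every `*_impl` extension in this patch resolves its kernels through the same type-dispatch machinery; a templated factory is instantiated once per supported dtype and the resulting function pointers are stored in a table indexed by the runtime type id. The standalone sketch below is an illustration only: `Factory`, `impl`, `dispatch_vector` and `populate` are invented names, and the real `DispatchVectorBuilder` covers all dpnp dtypes rather than two.

// Standalone illustration of the dispatch-vector idea used above.
#include <cstddef>
#include <cstdint>
#include <iostream>

using impl_fn_ptr_t = void (*)(const void *, void *, std::size_t);

// one type-specialized "kernel" per supported dtype
template <typename T>
void impl(const void *src, void *dst, std::size_t n)
{
    const T *s = static_cast<const T *>(src);
    T *d = static_cast<T *>(dst);
    for (std::size_t i = 0; i < n; ++i)
        d[i] = s[i];
}

// one slot per supported type id, filled once at module load
static impl_fn_ptr_t dispatch_vector[2];

template <typename fnT, typename T>
struct Factory
{
    fnT get() { return impl<T>; }
};

void populate()
{
    dispatch_vector[0] = Factory<impl_fn_ptr_t, std::int32_t>{}.get();
    dispatch_vector[1] = Factory<impl_fn_ptr_t, float>{}.get();
}

int main()
{
    populate();
    std::int32_t a[3] = {1, 2, 3}, b[3] = {};
    dispatch_vector[0](a, b, 3); // runtime type id selects the compiled kernel
    std::cout << b[2] << "\n";   // prints 3
}

The benefit of the pattern is that the per-dtype instantiation cost is paid once at import time, while each Python-level call performs a single array lookup, as in `topk_dispatch_vector[src_typeid]` above.
diff --git a/dpnp/tensor/libtensor/source/sorting/topk.hpp b/dpnp/tensor/libtensor/source/sorting/topk.hpp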
new file mode 100644
index 000000000000..d39c0eefdb93
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/topk.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_topk_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/tensor_accumulation.cpp b/dpnp/tensor/libtensor/source/tensor_accumulation.cpp
new file mode 100644
index 000000000000..faa3fc8b52c6
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/tensor_accumulation.cpp
@@ -0,0 +1,43 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "accumulators/accumulators_common.hpp"
+
+PYBIND11_MODULE(_tensor_accumulation_impl, m)
+{
+    dpctl::tensor::py_internal::init_accumulator_functions(m);
+}
diff --git a/dpnp/tensor/libtensor/source/tensor_ctors.cpp b/dpnp/tensor/libtensor/source/tensor_ctors.cpp
new file mode 100644
index 000000000000..cdd6e43ed9c5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/tensor_ctors.cpp
@@ -0,0 +1,497 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+#include "accumulators.hpp"
+#include "boolean_advanced_indexing.hpp"
+#include "clip.hpp"
+#include "copy_and_cast_usm_to_usm.hpp"
+#include "copy_as_contig.hpp"
+#include "copy_for_reshape.hpp"
+#include "copy_for_roll.hpp"
+#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
+#include "device_support_queries.hpp"
+#include "eye_ctor.hpp"
+#include "full_ctor.hpp"
+#include "integer_advanced_indexing.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
+#include "linear_sequences.hpp"
+#include "repeat.hpp"
+#include "simplify_iteration_space.hpp"
+#include "triul_ctor.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/strided_iters.hpp"
+#include "where.hpp"
+#include "zeros_ctor.hpp"
+
+namespace py = pybind11;
+
+static_assert(std::is_same_v<py::ssize_t, dpctl::tensor::ssize_t>);
+
+namespace
+{
+
+using dpctl::tensor::overlap::MemoryOverlap;
+using dpctl::tensor::overlap::SameLogicalTensors;
+
+using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;
+using dpctl::tensor::py_internal::py_as_c_contig;
+using dpctl::tensor::py_internal::py_as_f_contig;
+
+/* =========================== Copy for reshape ============================= */
+
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape;
+
+/* =========================== Copy for roll ============================= */
+
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d;
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd;
+
+/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */
+
+using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray;
+
+/* ============= linear-sequence ==================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine;
+using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;
+
+/* ================ Full ================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_full;
+
+/* ================ Zeros ================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_zeros;
+
+/* ============== Advanced Indexing ============= */
+using dpctl::tensor::py_internal::usm_ndarray_put;
+using dpctl::tensor::py_internal::usm_ndarray_take;
+
+using dpctl::tensor::py_internal::py_extract;
+using dpctl::tensor::py_internal::py_mask_positions;
+using dpctl::tensor::py_internal::py_nonzero;
+using dpctl::tensor::py_internal::py_place;
+
+/* ================= Repeat ====================*/
+using dpctl::tensor::py_internal::py_cumsum_1d;
+using dpctl::tensor::py_internal::py_repeat_by_scalar;
+using dpctl::tensor::py_internal::py_repeat_by_sequence;
+
+/* ================ Eye ================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_eye;
+
+/* =========================== Tril and triu ============================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_triul;
+
+/* =========================== Where ============================== */
+
+using dpctl::tensor::py_internal::py_where;
+
+/* =========================== Clip ============================== */
+using dpctl::tensor::py_internal::py_clip;
+
+// populate dispatch tables
+void init_dispatch_tables(void)
+{
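+    // Editorial note (annotation, not upstream code): dispatch *tables* are
+    // two-dimensional lookups keyed by a pair of type ids (e.g. source and
+    // destination dtype for copy-and-cast), while the dispatch *vectors*
+    // initialized below are one-dimensional lookups keyed by a single dtype.
+    // Schematically, with hypothetical names:
+    //
+    //   fn_ptr_t table[td_ns::num_types][td_ns::num_types];
+    //   table[src_typeid][dst_typeid] -> type-specialized kernel
+    //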
+    using namespace dpctl::tensor::py_internal;
+
+    init_copy_and_cast_usm_to_usm_dispatch_tables();
+    init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables();
+    init_advanced_indexing_dispatch_tables();
+    init_where_dispatch_tables();
+    return;
+}
+
+// populate dispatch vectors
+void init_dispatch_vectors(void)
+{
+    using namespace dpctl::tensor::py_internal;
+
+    init_copy_as_contig_dispatch_vectors();
+    init_copy_for_reshape_dispatch_vectors();
+    init_copy_for_roll_dispatch_vectors();
+    init_linear_sequences_dispatch_vectors();
+    init_full_ctor_dispatch_vectors();
+    init_zeros_ctor_dispatch_vectors();
+    init_eye_ctor_dispatch_vectors();
+    init_triul_ctor_dispatch_vectors();
+
+    populate_masked_extract_dispatch_vectors();
+    populate_masked_place_dispatch_vectors();
+
+    populate_mask_positions_dispatch_vectors();
+
+    populate_cumsum_1d_dispatch_vectors();
+    init_repeat_dispatch_vectors();
+
+    init_clip_dispatch_vectors();
+
+    return;
+}
+
+} // namespace
+
+PYBIND11_MODULE(_tensor_impl, m)
+{
+    init_dispatch_tables();
+    init_dispatch_vectors();
+
+    using dpctl::tensor::strides::contract_iter;
+    m.def(
+        "_contract_iter", &contract_iter,
+        "Simplifies iteration of array of given shape & stride. Returns "
+        "a triple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension, which traverses the same elements as the original "
+        "iterator, possibly in a different order.");
+
+    m.def("_copy_usm_ndarray_into_usm_ndarray",
+          &copy_usm_ndarray_into_usm_ndarray,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same "
+          "shape. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_as_c_contig", &py_as_c_contig,
+          "Copies from usm_ndarray `src` into C-contiguous usm_ndarray "
+          "`dst` of the same shape and the same data type. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_as_f_contig", &py_as_f_contig,
+          "Copies from usm_ndarray `src` into F-contiguous usm_ndarray "
+          "`dst` of the same shape and the same data type. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    using dpctl::tensor::strides::contract_iter2;
+    m.def(
+        "_contract_iter2", &contract_iter2,
+        "Simplifies iteration over elements of pair of arrays of given shape "
+        "with strides stride1 and stride2. Returns "
+        "a 5-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    using dpctl::tensor::strides::contract_iter3;
+    m.def(
+        "_contract_iter3", &contract_iter3,
+        "Simplifies iteration over elements of 3-tuple of arrays of given "
+        "shape "
+        "with strides stride1, stride2, and stride3. Returns "
+        "a 7-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    using dpctl::tensor::strides::contract_iter4;
+    m.def(
+        "_contract_iter4", &contract_iter4,
+        "Simplifies iteration over elements of 4-tuple of arrays of given "
+        "shape "
+        "with strides stride1, stride2, stride3, and stride4. Returns "
+        "a 9-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    static constexpr char orderC = 'C';
+    m.def(
+        "_ravel_multi_index",
+        [](const std::vector<py::ssize_t> &mi,
+           const std::vector<py::ssize_t> &shape, char order = 'C') {
+            if (order == orderC) {
+                return dpctl::tensor::py_internal::_ravel_multi_index_c(mi,
+                                                                        shape);
+            }
+            else {
+                return dpctl::tensor::py_internal::_ravel_multi_index_f(mi,
+                                                                        shape);
+            }
+        },
+        "");
+
+    m.def(
+        "_unravel_index",
+        [](py::ssize_t flat_index, const std::vector<py::ssize_t> &shape,
+           char order = 'C') {
+            if (order == orderC) {
+                return dpctl::tensor::py_internal::_unravel_index_c(flat_index,
+                                                                    shape);
+            }
+            else {
+                return dpctl::tensor::py_internal::_unravel_index_f(flat_index,
+                                                                    shape);
+            }
+        },
+        "");
+
+    m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "number of elements using underlying 'C'-contiguous order for flat "
+          "traversal. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "shapes using underlying 'C'-contiguous order for flat "
+          "traversal with shift. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("shift"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "shapes using underlying 'C'-contiguous order for "
+          "traversal with shifts along each axis. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("shifts"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
+          "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
+          "specified by "
+          "starting point `start` and step `dt`. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("start"), py::arg("dt"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine,
+          "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
+          "specified by "
+          "starting point `start` and end point `end`. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("start"), py::arg("end"), py::arg("dst"),
+          py::arg("include_endpoint"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_copy_numpy_ndarray_into_usm_ndarray",
+          &copy_numpy_ndarray_into_usm_ndarray,
+          "Copy from numpy array `src` into usm_ndarray `dst` synchronously.",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
+          "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_full_usm_ndarray", &usm_ndarray_full,
+          "Populate usm_ndarray `dst` with given fill_value.",
+          py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_take", &usm_ndarray_take,
+          "Takes elements at usm_ndarray indices `ind` and axes starting "
+          "at axis `axis_start` from array `src` and copies them "
+          "into usm_ndarray `dst` synchronously. "
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"),
+          py::arg("mode"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_put", &usm_ndarray_put,
+          "Puts elements at usm_ndarray indices `ind` and axes starting "
+          "at axis `axis_start` into array `dst` from "
+          "usm_ndarray `val` synchronously. "
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"),
+          py::arg("mode"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_eye", &usm_ndarray_eye,
+          "Fills input 2D contiguous usm_ndarray `dst` with "
+          "zeros outside of the diagonal "
+          "specified by the diagonal index `k`, "
+          "which is filled with ones. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("k"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("default_device_fp_type",
+          dpctl::tensor::py_internal::default_device_fp_type,
+          "Gives default floating point type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_int_type",
+          dpctl::tensor::py_internal::default_device_int_type,
+          "Gives default signed integer type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_uint_type",
+          dpctl::tensor::py_internal::default_device_uint_type,
+          "Gives default unsigned integer type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_bool_type",
+          dpctl::tensor::py_internal::default_device_bool_type,
+          "Gives default boolean type supported by device.", py::arg("dev"));
+
+    m.def("default_device_complex_type",
+          dpctl::tensor::py_internal::default_device_complex_type,
+          "Gives default complex floating point type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_index_type",
+          dpctl::tensor::py_internal::default_device_index_type,
+          "Gives default index type supported by device.", py::arg("dev"));
+
+    auto tril_fn = [](const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends);
+    };
+    m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"),
+          py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    auto triu_fn = [](const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends);
+    };
+    m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"),
+          py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
+          py::arg("cumsum"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"),
+          py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto overlap = [](const dpctl::tensor::usm_ndarray &x1,
+                      const dpctl::tensor::usm_ndarray &x2) -> bool {
+        auto const &overlap = MemoryOverlap();
+        return overlap(x1, x2);
+    };
+    m.def("_array_overlap", overlap,
+          "Determines if the memory regions indexed by each array overlap",
+          py::arg("array1"), py::arg("array2"));
+
+    auto same_logical_tensors =
+        [](const dpctl::tensor::usm_ndarray &x1,
+           const dpctl::tensor::usm_ndarray &x2) -> bool {
+        auto const &same_logical_tensors = SameLogicalTensors();
+        return same_logical_tensors(x1, x2);
+    };
+    m.def("_same_logical_tensors", same_logical_tensors,
+          "Determines if the memory regions indexed by each array are the same",
+          py::arg("array1"), py::arg("array2"));
+
+    m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
+          py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"),
+          py::arg("mask_shape"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"),
+          py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src,
+                              const dpctl::tensor::usm_ndarray &dst,
+                              const dpctl::tensor::usm_ndarray &reps,
+                              const dpctl::tensor::usm_ndarray &cumsum,
+                              std::optional<const int> axis,
+                              sycl::queue &exec_q,
+                              const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(),
+                                         exec_q, depends);
+        }
+        else {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q,
+                                         depends);
+        }
+    };
+    m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"),
+          py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src,
+                            const dpctl::tensor::usm_ndarray &dst,
+                            const py::ssize_t reps,
+                            std::optional<const int> axis, sycl::queue &exec_q,
+                            const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q,
+                                       depends);
+        }
+        else {
+            return py_repeat_by_scalar(src, dst, reps, exec_q, depends);
+        }
+    };
+    m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"),
+          py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_clip", &py_clip,
+          "Clamps elements of array `x` to the range "
+          "[`min`, `max`] and writes the result to the "
+          "array `dst` for each element of `x`, `min`, and `max`. "
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+}
diff --git a/dpnp/tensor/libtensor/source/tensor_elementwise.cpp b/dpnp/tensor/libtensor/source/tensor_elementwise.cpp
new file mode 100644
index 000000000000..76b9916ca9d3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/tensor_elementwise.cpp
@@ -0,0 +1,45 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "elementwise_functions/elementwise_common.hpp"
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(_tensor_elementwise_impl, m)
+{
+    dpctl::tensor::py_internal::init_elementwise_functions(m);
+}
diff --git a/dpnp/tensor/libtensor/source/tensor_linalg.cpp b/dpnp/tensor/libtensor/source/tensor_linalg.cpp
new file mode 100644
index 000000000000..4a1b5fb79b9e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/tensor_linalg.cpp
@@ -0,0 +1,41 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "linalg_functions/dot.hpp" +#include + +PYBIND11_MODULE(_tensor_linalg_impl, m) +{ + dpctl::tensor::py_internal::init_dot(m); +} diff --git a/dpnp/tensor/libtensor/source/tensor_reductions.cpp b/dpnp/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 000000000000..6e6a24f7b934 --- /dev/null +++ b/dpnp/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,43 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include + +#include "reductions/reduction_common.hpp" + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_reduction_functions(m); +} diff --git a/dpnp/tensor/libtensor/source/tensor_sorting.cpp b/dpnp/tensor/libtensor/source/tensor_sorting.cpp new file mode 100644 index 000000000000..318c3559d77c --- /dev/null +++ b/dpnp/tensor/libtensor/source/tensor_sorting.cpp @@ -0,0 +1,55 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include + +#include "sorting/isin.hpp" +#include "sorting/merge_argsort.hpp" +#include "sorting/merge_sort.hpp" +#include "sorting/radix_argsort.hpp" +#include "sorting/radix_sort.hpp" +#include "sorting/searchsorted.hpp" +#include "sorting/topk.hpp" + +PYBIND11_MODULE(_tensor_sorting_impl, m) +{ + dpctl::tensor::py_internal::init_isin_functions(m); + dpctl::tensor::py_internal::init_merge_sort_functions(m); + dpctl::tensor::py_internal::init_merge_argsort_functions(m); + dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_radix_sort_functions(m); + dpctl::tensor::py_internal::init_radix_argsort_functions(m); + dpctl::tensor::py_internal::init_topk_functions(m); +} diff --git a/dpnp/tensor/libtensor/source/triul_ctor.cpp b/dpnp/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 000000000000..13e909196460 --- /dev/null +++ b/dpnp/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,246 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <algorithm> // for std::copy
+#include <cstddef>   // for std::size_t
+#include <iterator>  // for std::begin, std::end
+#include <memory>    // for std::make_shared
+#include <utility>   // for std::pair, std::move
+#include <vector>    // for std::vector, std::begin, std::end
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "kernels/constructors.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+using dpctl::tensor::kernels::constructors::tri_fn_ptr_t;
+
+static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types];
+static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_triul(sycl::queue &exec_q,
+                      const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst,
+                      char part,
+                      py::ssize_t k = 0,
+                      const std::vector<sycl::event> &depends = {})
+{
+    // array dimensions must be the same
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    if (src_nd < 2) {
+        throw py::value_error("Array dimensions less than 2.");
+    }
+
+    // shapes must be the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; shapes_equal && i < src_nd; ++i) {
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        // TODO: could use a temporary, but this is done by the caller
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (dst_typeid != src_typeid) {
+        throw py::value_error("Array dtypes are not the same.");
+    }
+
+    // check same queues
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue context is not the same as allocation contexts");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto src_strides = src.get_strides_vector();
+    auto dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd - 2;
+    const py::ssize_t *shape = src_shape;
+
+    const shT iter_src_strides(std::begin(src_strides),
+                               std::begin(src_strides) + nd);
+    const shT iter_dst_strides(std::begin(dst_strides),
+                               std::begin(dst_strides) + nd);
+
+    simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (src_offset != 0 || dst_offset != 0) {
+        throw py::value_error("Reversed slice for dst is not supported");
+    }
+
+    nd += 2;
+
+    using usm_host_allocatorT =
+        dpctl::tensor::alloc_utils::usm_host_allocator<py::ssize_t>;
+    using usmshT = std::vector<py::ssize_t, usm_host_allocatorT>;
+
+    usm_host_allocatorT allocator(exec_q);
+    auto shp_host_shape_and_strides =
+        std::make_shared<usmshT>(3 * nd, allocator);
+
+    std::copy(simplified_shape.begin(), simplified_shape.end(),
+              shp_host_shape_and_strides->begin());
+    (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2];
+    (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1];
+
+    std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
+              shp_host_shape_and_strides->begin() + nd);
+    (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2];
+    (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1];
+
+    std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
+              shp_host_shape_and_strides->begin() + 2 * nd);
+    (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2];
+    (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1];
+
+    auto dev_shape_and_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(3 * nd,
+                                                                     exec_q);
+    py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get();
+
+    const sycl::event &copy_shape_and_strides = exec_q.copy<py::ssize_t>(
+        shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);
+
+    py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2];
+    py::ssize_t outer_range = src_nelems / inner_range;
+
+    sycl::event tri_ev;
+    if (part == 'l') {
+        auto fn = tril_generic_dispatch_vector[src_typeid];
+        tri_ev =
+            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
+               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
+    }
+    else {
+        auto fn = triu_generic_dispatch_vector[src_typeid];
+        tri_ev =
+            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
+               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
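+
+        // Editorial note (annotation, not upstream code): both branches pass
+        // {copy_shape_and_strides} as a dependency, so the packed shape and
+        // stride buffer is resident on the device before either kernel runs.
+        // The host_task submitted right after is the usual deferred-cleanup
+        // idiom for USM temporaries; a minimal hedged sketch of the pattern:
+        //
+        //   sycl::event cleanup = q.submit([&](sycl::handler &cgh) {
+        //       cgh.depends_on(kernel_ev);
+        //       cgh.host_task([ptr, ctx]() { sycl::free(ptr, ctx); });
+        //   });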
+    }
+
+    const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(tri_ev);
+        const auto &ctx = exec_q.get_context();
+        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
+        cgh.host_task(
+            [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides),
+             dev_shape_and_strides, ctx]() {
+                // capture of shp_host_shape_and_strides ensures the underlying
+                // vector exists for the entire execution of copying kernel
+                sycl_free_noexcept(dev_shape_and_strides, ctx);
+            });
+    });
+    // since host_task now owns USM allocation, release ownership by smart
+    // pointer
+    dev_shape_and_strides_owner.release();
+
+    return std::make_pair(
+        keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev);
+}
+
+void init_triul_ctor_dispatch_vectors(void)
+{
+
+    using namespace td_ns;
+    using dpctl::tensor::kernels::constructors::TrilGenericFactory;
+    using dpctl::tensor::kernels::constructors::TriuGenericFactory;
+
+    DispatchVectorBuilder<tri_fn_ptr_t, TrilGenericFactory, num_types> dvb1;
+    dvb1.populate_dispatch_vector(tril_generic_dispatch_vector);
+
+    DispatchVectorBuilder<tri_fn_ptr_t, TriuGenericFactory, num_types> dvb2;
+    dvb2.populate_dispatch_vector(triu_generic_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/triul_ctor.hpp b/dpnp/tensor/libtensor/source/triul_ctor.hpp
new file mode 100644
index 000000000000..47cc4ce8892d
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/triul_ctor.hpp
@@ -0,0 +1,58 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/where.cpp b/dpnp/tensor/libtensor/source/where.cpp new file mode 100644 index 000000000000..1d535a712917 --- /dev/null +++ b/dpnp/tensor/libtensor/source/where.cpp @@ -0,0 +1,264 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.where +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/where.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" +#include "where.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t; +using dpctl::tensor::kernels::search::where_strided_impl_fn_ptr_t; + +static where_contig_impl_fn_ptr_t where_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static where_strided_impl_fn_ptr_t + where_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::utils::keep_args_alive; + +std::pair + py_where(const dpctl::tensor::usm_ndarray &condition, + const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + + if (!dpctl::utils::queues_are_compatible(exec_q, + {x1, x2, condition, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int nd = condition.get_ndim(); + int x1_nd = x1.get_ndim(); + int x2_nd = x2.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (nd != x1_nd || nd != x2_nd) { + throw py::value_error( + "Input arrays are not of appropriate dimension for where kernel."); + } + + if (nd != dst_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for where kernel."); + } + + const py::ssize_t *x1_shape = x1.get_shape_raw(); + const py::ssize_t *x2_shape = x2.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *cond_shape = condition.get_shape_raw(); + + bool shapes_equal(true); + std::size_t nelems(1); + for (int i = 0; i < nd; ++i) { + const auto &sh_i = dst_shape[i]; + nelems *= static_cast(sh_i); + shapes_equal = shapes_equal && (x1_shape[i] == sh_i) && + (x2_shape[i] == sh_i) && (cond_shape[i] == sh_i); + } + + if (!shapes_equal) { + throw py::value_error("Axes are not of matching shapes."); + } + + if (nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(dst, condition) && !same_logical_tensors(dst, condition)) || + (overlap(dst, x1) && !same_logical_tensors(dst, x1)) || + (overlap(dst, x2) && !same_logical_tensors(dst, x2))) { + throw py::value_error("Destination array overlaps with input."); + } + + int x1_typenum = x1.get_typenum(); + int x2_typenum = x2.get_typenum(); + int cond_typenum = condition.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int cond_typeid = array_types.typenum_to_lookup_id(cond_typenum); + int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum); + 
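// Editorial note (annotation, not upstream code): the pair
+    // [value-type id][condition-type id] indexes the two-dimensional
+    // dispatch tables declared above; x1, x2 and dst must share one value
+    // type (checked below), while the condition may have any supported type.
+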
int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (x1_typeid != x2_typeid || x1_typeid != dst_typeid) { + throw py::value_error("Value arrays must have the same data type"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems); + + char *cond_data = condition.get_data(); + char *x1_data = x1.get_data(); + char *x2_data = x2.get_data(); + char *dst_data = dst.get_data(); + + bool is_x1_c_contig = x1.is_c_contiguous(); + bool is_x1_f_contig = x1.is_f_contiguous(); + + bool is_x2_c_contig = x2.is_c_contiguous(); + bool is_x2_f_contig = x2.is_f_contiguous(); + + bool is_cond_c_contig = condition.is_c_contiguous(); + bool is_cond_f_contig = condition.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = (is_x1_c_contig && is_x2_c_contig && is_cond_c_contig && + is_dst_c_contig); + bool all_f_contig = (is_x1_f_contig && is_x2_f_contig && is_cond_f_contig && + is_dst_f_contig); + + if (all_c_contig || all_f_contig) { + auto contig_fn = where_contig_dispatch_table[x1_typeid][cond_typeid]; + + auto where_ev = contig_fn(exec_q, nelems, cond_data, x1_data, x2_data, + dst_data, depends); + sycl::event ht_ev = + keep_args_alive(exec_q, {x1, x2, dst, condition}, {where_ev}); + + return std::make_pair(ht_ev, where_ev); + } + + auto const &cond_strides = condition.get_strides_vector(); + auto const &x1_strides = x1.get_strides_vector(); + auto const &x2_strides = x2.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_cond_strides; + shT simplified_x1_strides; + shT simplified_x2_strides; + shT simplified_dst_strides; + py::ssize_t cond_offset(0); + py::ssize_t x1_offset(0); + py::ssize_t x2_offset(0); + py::ssize_t dst_offset(0); + + simplify_iteration_space_4( + nd, x1_shape, cond_strides, x1_strides, x2_strides, dst_strides, + // outputs + simplified_shape, simplified_cond_strides, simplified_x1_strides, + simplified_x2_strides, simplified_dst_strides, cond_offset, x1_offset, + x2_offset, dst_offset); + + auto fn = where_strided_dispatch_table[x1_typeid][cond_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // common shape and strides + simplified_shape, simplified_cond_strides, simplified_x1_strides, + simplified_x2_strides, simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event where_ev = fn(exec_q, nelems, nd, cond_data, x1_data, x2_data, + dst_data, packed_shape_strides, cond_offset, + x1_offset, x2_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {where_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + 
+        keep_args_alive(exec_q, {x1, x2, condition, dst}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, where_ev);
+}
+
+void init_where_dispatch_tables(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::search::WhereContigFactory;
+    DispatchTableBuilder<where_contig_impl_fn_ptr_t, WhereContigFactory,
+                         num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(where_contig_dispatch_table);
+
+    using dpctl::tensor::kernels::search::WhereStridedFactory;
+    DispatchTableBuilder<where_strided_impl_fn_ptr_t, WhereStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(where_strided_dispatch_table);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/where.hpp b/dpnp/tensor/libtensor/source/where.hpp
new file mode 100644
index 000000000000..ba81d8b11642
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/where.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares Python API for implementation functions of
+/// dpctl.tensor.where
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    py_where(const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             sycl::queue &,
+             const std::vector<sycl::event> &);
+
+extern void init_where_dispatch_tables(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/zeros_ctor.cpp b/dpnp/tensor/libtensor/source/zeros_ctor.cpp
new file mode 100644
index 000000000000..b9a2e01bea4a
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/zeros_ctor.cpp
@@ -0,0 +1,159 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "zeros_ctor.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &,
+                                             std::size_t,
+                                             char *,
+                                             const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to fill given contiguous memory allocation
+ * with zeros.
+ *
+ * @param exec_q Sycl queue to which kernel is submitted for execution.
+ * @param nelems Length of the sequence
+ * @param dst_p Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event zeros_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+
+    static constexpr int memset_val(0);
+    sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                   nelems * sizeof(dstTy));
+    });
+
+    return fill_ev;
+}
+
+template <typename fnT, typename dstTy>
+struct ZerosContigFactory
+{
+    fnT get()
+    {
+        fnT f = zeros_contig_impl<dstTy>;
+        return f;
+    }
+};
+
+static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> &depends)
+{
+    py::ssize_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with the allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *dst_data = dst.get_data();
+
+    if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
+        auto fn = zeros_contig_dispatch_vector[dst_typeid];
+
+        sycl::event zeros_contig_event =
+            fn(exec_q, static_cast<std::size_t>(dst_nelems), dst_data,
+               depends);
+
+        return std::make_pair(
+            keep_args_alive(exec_q, {dst}, {zeros_contig_event}),
+            zeros_contig_event);
+    }
+    else {
+        throw std::runtime_error(
+            "Only population of contiguous usm_ndarray objects is supported.");
+    }
+}
+
+void init_zeros_ctor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+
+    DispatchVectorBuilder<zeros_contig_fn_ptr_t, ZerosContigFactory, num_types>
+        dvb;
+    dvb.populate_dispatch_vector(zeros_contig_dispatch_vector);
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/zeros_ctor.hpp b/dpnp/tensor/libtensor/source/zeros_ctor.hpp
new file mode 100644
index 000000000000..d104e37f5533
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/zeros_ctor.hpp
@@ -0,0 +1,53 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> &depends = {});
+
+extern void init_zeros_ctor_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tests/config.py b/dpnp/tests/config.py
index a49fd8cad250..e576c643695b 100644
--- a/dpnp/tests/config.py
+++ b/dpnp/tests/config.py
@@ -4,6 +4,7 @@
 float16_types = bool(os.getenv("DPNP_TEST_FLOAT_16", 0))
 complex_types = bool(os.getenv("DPNP_TEST_COMPLEX_TYPES", 0))
 bool_types = bool(os.getenv("DPNP_TEST_BOOL_TYPES", 0))
+skip_tensor_tests = bool(int(os.getenv("SKIP_TENSOR_TESTS", 0)))
 
 infra_warnings_enable = bool(os.getenv("DPNP_INFRA_WARNINGS_ENABLE", 0))
diff --git a/dpnp/tests/conftest.py b/dpnp/tests/conftest.py
index 5d766566bca5..8e3cb97ad41f 100644
--- a/dpnp/tests/conftest.py
+++ b/dpnp/tests/conftest.py
@@ -97,6 +97,10 @@ def pytest_configure(config):
     # Equivalent to norecursedirs = tests_perf
     config.addinivalue_line("norecursedirs", "tests_perf")
 
+    # Equivalent to norecursedirs = tests/tensor (conditional)
+    if dtype_config.skip_tensor_tests:
+        config.addinivalue_line("norecursedirs", "tests/tensor")
+
     # Register pytest markers
     config.addinivalue_line(
         "markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')"
diff --git a/dpnp/tests/tensor/__init__.py b/dpnp/tests/tensor/__init__.py
new file mode 100644
index 000000000000..b18d8ddc7dd1
--- /dev/null
+++ b/dpnp/tests/tensor/__init__.py
@@ -0,0 +1,31 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+__doc__ = r"""
+Test suite for tensor functionality migrated from dpctl.
+Running the test suite requires Cython and a working compiler."""
diff --git a/dpnp/tests/tensor/conftest.py b/dpnp/tests/tensor/conftest.py
new file mode 100644
index 000000000000..ea10d1322e76
--- /dev/null
+++ b/dpnp/tests/tensor/conftest.py
@@ -0,0 +1,31 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +"""Configures pytest to discover helper/ module""" + +from dpnp.tests.conftest import suppress_invalid_numpy_warnings diff --git a/dpnp/tests/tensor/elementwise/__init__.py b/dpnp/tests/tensor/elementwise/__init__.py new file mode 100644 index 000000000000..a794242cd7bb --- /dev/null +++ b/dpnp/tests/tensor/elementwise/__init__.py @@ -0,0 +1,32 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" +Collection of test and utility files for testing elementwise operations +over :class:`dpnp.tensor.usm_ndarray`. +""" diff --git a/dpnp/tests/tensor/elementwise/test_abs.py b/dpnp/tests/tensor/elementwise/test_abs.py new file mode 100644 index 000000000000..535aebfb4d58 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_abs.py @@ -0,0 +1,224 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools +import warnings + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _complex_fp_dtypes, + _real_fp_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_abs_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + if np.issubdtype(arg_dt, np.complexfloating): + type_map = { + np.dtype("c8"): np.dtype("f4"), + np.dtype("c16"): np.dtype("f8"), + } + assert dpt.abs(X).dtype == type_map[arg_dt] + + r = dpt.empty_like(X, dtype=type_map[arg_dt]) + dpt.abs(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X))) + else: + assert dpt.abs(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.abs(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X))) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_abs_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.abs(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = dpt.asnumpy(X) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +def test_abs_types_property(): + get_queue_or_skip() + types = dpt.abs.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.abs.types_ + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_abs_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + exp_dt = np.abs(np.ones(tuple(), dtype=arg_dt)).dtype + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.ones(U.shape, dtype=exp_dt) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + for ord in ["C", "F", "A", "K"]: + Y = dpt.abs(U, order=ord) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_abs_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + Xnp = np.random.standard_normal( + size=input_shape + ) + 1j * 
np.random.standard_normal(size=input_shape) + Xnp = Xnp.astype(arg_dt) + X[...] = Xnp + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.abs(U, order=ord) + expected_Y = np.abs(np.transpose(Xnp[:, ::-1, ::-1, :], perms)) + tol = dpt.finfo(Y.dtype).resolution + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) + + +def test_abs_out_overlap(): + get_queue_or_skip() + + X = dpt.arange(-3, 3, 1, dtype="i4") + expected = dpt.asarray([3, 2, 1, 0, 1, 2], dtype="i4") + Y = dpt.abs(X, out=X) + + assert Y is X + assert dpt.all(expected == X) + + X = dpt.arange(-3, 3, 1, dtype="i4") + expected = expected[::-1] + Y = dpt.abs(X, out=X[::-1]) + assert Y is not X + assert dpt.all(expected == X) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_abs_real_fp_special_values(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + nans_ = [dpt.nan, -dpt.nan] + infs_ = [dpt.inf, -dpt.inf] + finites_ = [-1.0, -0.0, 0.0, 1.0] + inps_ = nans_ + infs_ + finites_ + + x = dpt.asarray(inps_, dtype=dtype) + r = dpt.abs(x) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + expected_np = np.abs(np.asarray(inps_, dtype=dtype)) + + expected = dpt.asarray(expected_np, dtype=dtype) + tol = dpt.finfo(r.dtype).resolution + + assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True) + + +@pytest.mark.parametrize("dtype", _complex_fp_dtypes) +def test_abs_complex_fp_special_values(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + nans_ = [dpt.nan, -dpt.nan] + infs_ = [dpt.inf, -dpt.inf] + finites_ = [-1.0, -0.0, 0.0, 1.0] + inps_ = nans_ + infs_ + finites_ + c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)] + + z = dpt.asarray(c_, dtype=dtype) + r = dpt.abs(z) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + expected_np = np.abs(np.asarray(c_, dtype=dtype)) + + expected = dpt.asarray(expected_np, dtype=dtype) + tol = dpt.finfo(r.dtype).resolution + + assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_abs_alignment(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.ones(512, dtype=dtype) + r = dpt.abs(x) + + r2 = dpt.abs(x[1:]) + assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2)) + + dpt.abs(x[:-1], out=r[1:]) + assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2)) diff --git a/dpnp/tests/tensor/elementwise/test_add.py b/dpnp/tests/tensor/elementwise/test_add.py new file mode 100644 index 000000000000..28a4efb21e94 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_add.py @@ -0,0 +1,590 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes +import re + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_add_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.add(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.add( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.add(ar1, ar2, out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.add(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.add( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all() + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.add(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_add_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.add(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_add_order(): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", 
"f4", "i4"]): + ar1 = dpt.ones(test_shape, dtype=dt1, order="C") + ar2 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.add(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.add(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.add(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.add(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype=dt1, order="F") + ar2 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.add(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.add(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.add(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.add(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.add(ar1, ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.add(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.add(ar1, ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.add(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + +def test_add_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.add(m, v) + assert (dpt.asnumpy(r) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r2 = dpt.add(v, m) + assert (dpt.asnumpy(r2) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r3 = dpt.empty_like(m) + dpt.add(m, v, out=r3) + assert (dpt.asnumpy(r3) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r4 = dpt.empty_like(m) + dpt.add(v, m, out=r4) + assert (dpt.asnumpy(r4) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + +def test_add_broadcasting_new_shape(): + get_queue_or_skip() + + ar1 = dpt.ones((6, 1), dtype="i4") + ar2 = dpt.arange(6, dtype="i4") + + r = dpt.add(ar1, ar2) + assert (dpt.asnumpy(r) == np.arange(1, 7, dtype="i4")[np.newaxis, :]).all() + + r1 = dpt.add(ar2, ar1) + assert (dpt.asnumpy(r1) == np.arange(1, 7, dtype="i4")[np.newaxis, :]).all() + + r2 = dpt.add(ar1[::2], ar2[::2]) + assert ( + dpt.asnumpy(r2) == np.arange(1, 7, dtype="i4")[::2][np.newaxis, :] + ).all() + + r3 = dpt.empty_like(ar1) + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=r3) + + ar3 = dpt.ones((6, 1), dtype="i4") + ar4 = dpt.ones((1, 6), dtype="i4") + + r4 = dpt.add(ar3, ar4) + assert (dpt.asnumpy(r4) == np.full((6, 6), 2, dtype="i4")).all() + + r5 = dpt.add(ar4, ar3) + assert (dpt.asnumpy(r5) == np.full((6, 6), 2, dtype="i4")).all() + + r6 = dpt.add(ar3[::2], ar4[:, ::2]) + assert (dpt.asnumpy(r6) == np.full((3, 3), 2, dtype="i4")).all() + + r7 = dpt.add(ar3[::2], ar4) + assert (dpt.asnumpy(r7) == np.full((3, 6), 2, dtype="i4")).all() + + +def test_add_broadcasting_error(): + get_queue_or_skip() + m = dpt.ones((10, 10), dtype="i4") + v = dpt.ones((3,), dtype="i4") + with pytest.raises(ValueError): + dpt.add(m, v) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_add_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.add(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.add(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + 
+class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_add_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.add(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_add_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.add(a, c) + + +def test_add_types_property(): + get_queue_or_skip() + types = dpt.add.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.add.types_ + + +def test_add_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + y = dpt.empty_like(ar1, sycl_queue=cpu_queue) + with pytest.raises(dpt.ExecutionPlacementError) as excinfo: + dpt.add(ar1, ar2, out=y) + assert "Input and output allocation queues are not compatible" in str( + excinfo.value + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = dpt.empty(3) + with pytest.raises(ValueError) as excinfo: + dpt.add(ar1, ar2, out=y) + assert "The shape of input and output arrays are inconsistent" in str( + excinfo.value + ) + + ar1 = np.ones(2, dtype="float32") + ar2 = np.ones_like(ar1, dtype="int32") + with pytest.raises(dpt.ExecutionPlacementError) as excinfo: + dpt.add(ar1, ar2) + assert re.match( + "Execution placement can not be unambiguously inferred.*", + str(excinfo.value), + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = np.empty(ar1.shape, dtype=ar1.dtype) + with pytest.raises(TypeError) as excinfo: + dpt.add(ar1, ar2, out=y) + assert "output array must be of usm_ndarray type" in str(excinfo.value) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_add_dtype_error( + dtype, +): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + ar1 = dpt.ones(5, dtype=dtype) + ar2 = dpt.ones_like(ar1, dtype="f4") + + y = dpt.zeros_like(ar1, dtype="int8") + with pytest.raises(ValueError) as excinfo: + dpt.add(ar1, ar2, out=y) + assert re.match("Output array of type.*is needed", str(excinfo.value)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_add_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X += int(0) + elif dt_kind == "f": + X += float(0) + elif dt_kind == "c": + X += complex(0) + elif dt_kind == "b": + X += bool(0) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # operators use a different Python 
implementation which permits + # same kind style casting + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 += ar2 + assert ( + dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2] + ar3 += ar4 + assert ( + dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype) + ).all() + else: + with pytest.raises(ValueError): + ar1 += ar2 + + # here, test the special case where out is the first argument + # so an in-place kernel is used for efficiency + # this covers a specific branch in the BinaryElementwiseFunc logic + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + dpt.add(ar1, ar2, out=ar1) + assert ( + dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2] + dpt.add(ar3, ar4, out=ar3) + assert ( + dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype) + ).all() + else: + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64): + dpt.add(ar1, ar2, out=ar2) + assert ( + dpt.asnumpy(ar2) == np.full(ar2.shape, 2, dtype=ar2.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2] + dpt.add(ar3, ar4, out=ar4) + assert ( + dpt.asnumpy(ar4) == np.full(ar4.shape, 2, dtype=ar4.dtype) + ).all() + else: + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar2) + + +def test_add_inplace_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + dpt.add(m, v, out=m) + assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + # check case where second arg is out + dpt.add(v, m, out=m) + assert ( + dpt.asnumpy(m) == np.arange(10, dtype="i4")[np.newaxis, 1:10:2] + ).all() + + +def test_add_inplace_operator_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + m += v + assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + +def test_add_inplace_operator_mutual_broadcast(): + get_queue_or_skip() + + x1 = dpt.ones((1, 10), dtype="i4") + x2 = dpt.ones((10, 1), dtype="i4") + + with pytest.raises(ValueError): + dpt.add._inplace_op(x1, x2) + + +def test_add_inplace_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=cpu_queue) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones(3, dtype="float32") + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = np.ones(2, dtype="float32") + ar2 = dpt.ones(2, dtype="float32") + with pytest.raises(TypeError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = {} + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = dpt.ones((2, 1), dtype="float32") + ar2 = dpt.ones((1, 2), dtype="float32") + with 
pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar1) + + +def test_add_inplace_operator_errors(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.ones(10, dtype="i4", sycl_queue=q1) + with pytest.raises(TypeError): + dpt.add._inplace_op(dict(), x) + + x.flags["W"] = False + with pytest.raises(ValueError): + dpt.add._inplace_op(x, 2) + + x_q1 = dpt.ones(10, dtype="i4", sycl_queue=q1) + x_q2 = dpt.ones(10, dtype="i4", sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.add._inplace_op(x_q1, x_q2) + + +def test_add_inplace_same_tensors(): + get_queue_or_skip() + + ar1 = dpt.ones(10, dtype="i4") + ar1 += ar1 + assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all() + + ar1 = dpt.ones(10, dtype="i4") + ar2 = dpt.ones(10, dtype="i4") + dpt.add(ar1, ar2, out=ar1) + # all ar1 vals should be 2 + assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all() + + dpt.add(ar2, ar1, out=ar2) + # all ar2 vals should be 3 + assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 3, dtype="i4")).all() + + dpt.add(ar1, ar2, out=ar2) + # all ar2 vals should be 5 + assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 5, dtype="i4")).all() + + +def test_add_str_repr(): + add_s = str(dpt.add) + assert isinstance(add_s, str) + assert "add" in add_s + + add_r = repr(dpt.add) + assert isinstance(add_r, str) + assert "add" in add_r + + +def test_add_cfd(): + q1 = get_queue_or_skip() + q2 = dpctl.SyclQueue(q1.sycl_device) + + x1 = dpt.ones(10, sycl_queue=q1) + x2 = dpt.ones(10, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.add(x1, x2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.add(x1, x1, out=x2) + + +def test_add_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.add(x1, x2, out=out) + + +def test_add_out_need_temporary(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="u4") + + dpt.add(x[:6], 1, out=x[-6:]) + + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) diff --git a/dpnp/tests/tensor/elementwise/test_angle.py b/dpnp/tests/tensor/elementwise/test_angle.py new file mode 100644 index 000000000000..09dc2bfc414f --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_angle.py @@ -0,0 +1,111 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _complex_fp_dtypes, + _no_complex_dtypes, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_angle_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray(1, dtype=dtype, sycl_queue=q) + dt = dpt.dtype(dtype) + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(dt, dpt.complex64, _fp16, _fp64): + assert dpt.angle(x).dtype == dpt.float32 + else: + assert dpt.angle(x).dtype == dpt.float64 + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:]) +def test_angle_real(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.arange(10, dtype=dtype, sycl_queue=q) + r = dpt.angle(x) + + assert dpt.all(r == 0) + + +@pytest.mark.parametrize("dtype", _complex_fp_dtypes) +def test_angle_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + tol = 8 * dpt.finfo(dtype).resolution + vals = dpt.pi * dpt.arange(10, dtype=dpt.finfo(dtype).dtype, sycl_queue=q) + + x = dpt.zeros(10, dtype=dtype, sycl_queue=q) + + x.imag[...] = vals + r = dpt.angle(x) + expected = dpt.atan2(x.imag, x.real) + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + x.real[...] += dpt.pi + r = dpt.angle(x) + expected = dpt.atan2(x.imag, x.real) + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_angle_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + vals = [np.nan, -np.nan, np.inf, -np.inf, +0.0, -0.0] + vals = [complex(*val) for val in itertools.product(vals, repeat=2)] + + x = dpt.asarray(vals, dtype=dtype, sycl_queue=q) + + r = dpt.angle(x) + expected = dpt.atan2(x.imag, x.real) + + tol = 8 * dpt.finfo(dtype).resolution + + assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True) diff --git a/dpnp/tests/tensor/elementwise/test_atan2.py b/dpnp/tests/tensor/elementwise/test_atan2.py new file mode 100644 index 000000000000..7a7bb92cdd7b --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_atan2.py @@ -0,0 +1,524 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_atan2_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + r = dpt.atan2(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.arctan2( + np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + + tol = 8 * max( + dpt.finfo(r.dtype).resolution, dpt.finfo(expected.dtype).resolution + ) + assert_allclose(dpt.asnumpy(r), expected, atol=tol, rtol=tol) + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q) + + r = dpt.atan2(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.arctan2( + np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + + tol = 8 * max( + dpt.finfo(r.dtype).resolution, dpt.finfo(expected.dtype).resolution + ) + assert_allclose(dpt.asnumpy(r), expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) +def test_atan2_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.atan2(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.atan2(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_one_nan(dt): + """If either x1_i or x2_i is NaN, the 
result is NaN."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([dpt.nan, dpt.nan, 1], dtype=dt)
+    x2 = dpt.asarray([dpt.nan, 1, dpt.nan], dtype=dt)
+
+    y = dpt.atan2(x1, x2)
+    assert dpt.all(dpt.isnan(y))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_positive_and_pzero(dt):
+    """If x1_i is greater than 0 and x2_i is +0, the result
+    is an approximation to +pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+    x2 = dpt.asarray([+0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_positive_and_nzero(dt):
+    """If x1_i is greater than 0 and x2_i is -0, the result
+    is an approximation to +pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+    x2 = dpt.asarray([-0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_positive(dt):
+    """If x1_i is +0 and x2_i is greater than 0,
+    the result is +0.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(+0.0, dtype=dt)
+
+    assert dpt.all(dpt.equal(actual, expected))
+    assert not dpt.any(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_pzero(dt):
+    """If x1_i is +0 and x2_i is +0, the result is +0."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([+0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(+0.0, dtype=dt)
+
+    assert dpt.all(dpt.equal(actual, expected))
+    assert not dpt.any(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_nzero(dt):
+    """
+    If x1_i is +0 and x2_i is -0, the result is an
+    approximation to +pi.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([-0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_negative(dt):
+    """
+    If x1_i is +0 and x2_i is less than 0, the result
+    is an approximation to +pi.
+ """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(+0.0, dtype=dt) + x2 = dpt.asarray([-0.5, -1, -2, -dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nzero_and_positive(dt): + """If x1_i is -0 and x2_i is greater than 0, + the result is -0. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-0.0, dtype=dt) + x2 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-0.0, dtype=dt) + + assert dpt.all(dpt.equal(actual, expected)) + assert dpt.all(dpt.signbit(actual)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nzero_and_pzero(dt): + """If x1_i is -0 and x2_i is +0, the result is -0.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-0.0, dtype=dt) + x2 = dpt.asarray([+0.0], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-0.0, dtype=dt) + + assert dpt.all(dpt.equal(actual, expected)) + assert dpt.all(dpt.signbit(actual)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nzero_and_nzero(dt): + """If x1_i is -0 and x2_i is -0, the result is + an approximation to -pi. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.0], dtype=dt) + x2 = dpt.asarray([-0.0], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nzero_and_negative(dt): + """If x1_i is -0 and x2_i is less than 0, the result + is an approximation to -pi. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.0], dtype=dt) + x2 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_negative_and_pzero(dt): + """If x1_i is less than 0 and x2_i is +0, the result + is an approximation to -pi/2. 
+ """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt) + x2 = dpt.asarray(+0.0, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi / 2, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_negative_and_nzero(dt): + """If x1_i is less than 0 and x2_i is -0, the result + is an approximation to -pi/2.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt) + x2 = dpt.asarray(-0.0, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi / 2, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pfinite_and_pinf(dt): + """If x1_i is greater than 0, x1_i is a finite number, + and x2_i is +infinity, the result is +0.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([0.5, 1, 2, 5], dtype=dt) + x2 = dpt.asarray(dpt.inf, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(+0.0, dtype=dt) + assert dpt.all(dpt.equal(actual, expected)) + assert not dpt.any(dpt.signbit(actual)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pfinite_and_ninf(dt): + """If x1_i is greater than 0, x1_i is a finite number, + and x2_i is -infinity, the result is an approximation + to +pi.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([0.5, 1, 2, 5], dtype=dt) + x2 = dpt.asarray(-dpt.inf, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nfinite_and_pinf(dt): + """If x1_i is less than 0, x1_i is a finite number, + and x2_i is +infinity, the result is -0.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.5, -1, -2, -5], dtype=dt) + x2 = dpt.asarray(dpt.inf, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-0.0, dtype=dt) + assert dpt.all(dpt.equal(actual, expected)) + assert dpt.all(dpt.signbit(actual)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nfinite_and_ninf(dt): + """If x1_i is less than 0, x1_i is a finite number, and + x2_i is -infinity, the result is an approximation + to -pi.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.5, -1, -2, -5], dtype=dt) + x2 = dpt.asarray(-dpt.inf, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pinf_and_finite(dt): + """If x1_i is +infinity and x2_i is a finite number, + the result is an approximation to +pi/2. 
+ """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(dpt.inf, dtype=dt) + x2 = dpt.asarray([-2, -0.0, 0.0, 2], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(dpt.pi / 2, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_ninf_and_finite(dt): + """If x1_i is -infinity and x2_i is a finite number, + the result is an approximation to -pi/2. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-dpt.inf, dtype=dt) + x2 = dpt.asarray([-2, -0.0, 0.0, 2], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi / 2, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pinf_and_pinf(dt): + """If x1_i is +infinity and x2_i is +infinity, + the result is an approximation to +pi/4. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(dpt.inf, dtype=dt) + x2 = dpt.asarray([dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(dpt.pi / 4, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pinf_and_ninf(dt): + """If x1_i is +infinity and x2_i is -infinity, + the result is an approximation to +3*pi/4. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(dpt.inf, dtype=dt) + x2 = dpt.asarray([-dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(3 * dpt.pi / 4, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_ninf_and_pinf(dt): + """If x1_i is -infinity and x2_i is +infinity, + the result is an approximation to -pi/4. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-dpt.inf, dtype=dt) + x2 = dpt.asarray([dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi / 4, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_ninf_and_ninf(dt): + """If x1_i is -infinity and x2_i is -infinity, + the result is an approximation to -3*pi/4. 
+ """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-dpt.inf, dtype=dt) + x2 = dpt.asarray([-dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-3 * dpt.pi / 4, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_and.py b/dpnp/tests/tensor/elementwise/test_bitwise_and.py new file mode 100644 index 000000000000..c9172cb9d7d6 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_bitwise_and.py @@ -0,0 +1,142 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_and_dtype_matrix_contig(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + + x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1) + + r = dpt.bitwise_and(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype) + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype) + r_np = np.bitwise_and(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_and_dtype_matrix_strided(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2] + + x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2] + + r = dpt.bitwise_and(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2] + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2] + r_np = np.bitwise_and(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +def test_bitwise_and_bool(): + get_queue_or_skip() + + x1 = dpt.asarray([True, False]) + x2 = dpt.asarray([False, True]) + + r_bw = dpt.bitwise_and(x1[:, dpt.newaxis], x2[dpt.newaxis]) + r_lo = dpt.logical_and(x1[:, dpt.newaxis], x2[dpt.newaxis]) + + assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_and_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X &= False + else: + X &= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_and_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 &= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 &= ar4 + assert dpt.all(ar3 == 1) + else: + with pytest.raises(ValueError): + ar1 &= ar2 diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_invert.py b/dpnp/tests/tensor/elementwise/test_bitwise_invert.py new file mode 
100644
index 000000000000..2b7a7c3a6f93
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_invert.py
@@ -0,0 +1,148 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _integral_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize(
+    "op_dtype",
+    [
+        "b1",
+    ]
+    + _integral_dtypes,
+)
+def test_bitwise_invert_dtype_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op_dtype)
+
+    r = dpt.bitwise_invert(ar1)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == ar1.dtype
+
+    expected = np.bitwise_not(dpt.asnumpy(ar1))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.bitwise_invert(ar1, out=r2)
+    assert dpt.all(dpt.equal(r, r2))
+
+    ar2 = dpt.zeros(sz, dtype=op_dtype)
+    r = dpt.bitwise_invert(ar2[::-1])
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.bitwise_not(np.zeros(ar2.shape, dtype=op_dtype))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar2.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    ar3 = dpt.ones(sz, dtype=op_dtype)
+    r2 = dpt.bitwise_invert(ar3[::2])
+    assert isinstance(r2, dpt.usm_ndarray)
+
+    expected = np.bitwise_not(np.ones(ar3.shape, dtype=op_dtype)[::2])
+    assert _compare_dtypes(r2.dtype, expected.dtype, sycl_queue=q)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+    r3 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.bitwise_invert(ar2[::-1], out=r3)
+    assert dpt.all(dpt.equal(r, r3))
+
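+
+# A small supplementary sketch (name and scope illustrative, not exhaustive):
+# bitwise inversion is an involution, so applying it twice must reproduce the
+# input exactly; "i4" is used here only as a representative dtype.
+def test_bitwise_invert_involution_sketch():
+    get_queue_or_skip()
+
+    x = dpt.arange(-8, 8, dtype="i4")
+    assert dpt.all(dpt.bitwise_invert(dpt.bitwise_invert(x)) == x)
+
+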
+@pytest.mark.parametrize("op_usm_type", _usm_types) +def test_bitwise_invert_usm_type_matrix(op_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type + ) + + r = dpt.bitwise_invert(ar1) + assert isinstance(r, dpt.usm_ndarray) + assert r.usm_type == op_usm_type + + +def test_bitwise_invert_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.bitwise_invert(ar1, order="C") + assert r1.flags.c_contiguous + r2 = dpt.bitwise_invert(ar1, order="F") + assert r2.flags.f_contiguous + r3 = dpt.bitwise_invert(ar1, order="A") + assert r3.flags.c_contiguous + r4 = dpt.bitwise_invert(ar1, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.zeros((20, 20), dtype="i4", order="F") + r1 = dpt.bitwise_invert(ar1, order="C") + assert r1.flags.c_contiguous + r2 = dpt.bitwise_invert(ar1, order="F") + assert r2.flags.f_contiguous + r3 = dpt.bitwise_invert(ar1, order="A") + assert r3.flags.f_contiguous + r4 = dpt.bitwise_invert(ar1, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.bitwise_invert(ar1, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.bitwise_invert(ar1, order="K") + assert r4.strides == (-1, 20) + + +def test_bitwise_invert_large_boolean(): + get_queue_or_skip() + + x = dpt.tril(dpt.ones((32, 32), dtype="?"), k=-1) + res = dpt.astype(dpt.bitwise_invert(x), "i4") + + assert dpt.all(res >= 0) + assert dpt.all(res <= 1) diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py b/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py new file mode 100644 index 000000000000..bb68aab227ab --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py @@ -0,0 +1,150 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_left_shift_dtype_matrix_contig(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]: + return + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op1_dtype) + dt2 = dpt.dtype(op2_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + x2 = dpt.arange(0, n, dtype=dt2) + + r = dpt.bitwise_left_shift(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + assert r.sycl_queue == x1.sycl_queue + assert r.sycl_queue == x2.sycl_queue + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype) + x2_np = np.arange(0, n, dtype=op2_dtype) + r_np = np.left_shift(x1_np, x2_np) + + assert r.dtype == r_np.dtype + assert (dpt.asnumpy(r) == r_np).all() + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]: + return + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op1_dtype) + dt2 = dpt.dtype(op2_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2] + x2 = dpt.arange(0, n, dtype=dt2)[::2] + + r = dpt.bitwise_left_shift(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + assert r.sycl_queue == x1.sycl_queue + assert r.sycl_queue == x2.sycl_queue + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2] + x2_np = np.arange(0, n, dtype=dt2)[::2] + r_np = np.left_shift(x1_np, x2_np) + + assert r.dtype == r_np.dtype + assert (dpt.asnumpy(r) == r_np).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_left_shift_range(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + x = dpt.ones(255, dtype=op_dtype) + y = dpt.asarray(64, dtype=op_dtype) + + z = dpt.bitwise_left_shift(x, y) + assert dpt.all(dpt.equal(z, 0)) + + +@pytest.mark.parametrize("dtype", _integral_dtypes) +def test_bitwise_left_shift_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + X <<= int(0) + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 <<= ar2 + assert dpt.all(ar1 == 2) + + ar3 = dpt.ones(sz, 
dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 <<= ar4 + assert dpt.all(ar3 == 2) + else: + with pytest.raises(ValueError): + ar1 <<= ar2 diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_or.py b/dpnp/tests/tensor/elementwise/test_bitwise_or.py new file mode 100644 index 000000000000..0e1a5bfeab1c --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_bitwise_or.py @@ -0,0 +1,158 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
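+
+# Note on test_bitwise_left_shift_range above: unlike C/C++, where shifting
+# by at least the bit width of the type is undefined behaviour, the tests pin
+# down a defined result, with shifts of >= the bit width yielding 0.  A
+# one-element sketch of the same convention ("i4" assumed for illustration):
+#
+#     x = dpt.asarray(1, dtype="i4")
+#     dpt.bitwise_left_shift(x, 32)   # expected to compare equal to 0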
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_or_dtype_matrix_contig(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + + x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1) + + r = dpt.bitwise_or(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype) + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype) + r_np = np.bitwise_or(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_or_dtype_matrix_strided(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2] + + x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2] + + r = dpt.bitwise_or(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2] + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2] + r_np = np.bitwise_or(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +def test_bitwise_or_bool(): + get_queue_or_skip() + + x1 = dpt.asarray([True, False]) + x2 = dpt.asarray([False, True]) + + r_bw = dpt.bitwise_or(x1[:, dpt.newaxis], x2[dpt.newaxis]) + r_lo = dpt.logical_or(x1[:, dpt.newaxis], x2[dpt.newaxis]) + + assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_or_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X |= False + else: + X |= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_or_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 |= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 |= ar4 + assert dpt.all(ar3 == 1) + else: + with pytest.raises(ValueError): + ar1 |= ar2 + dpt.bitwise_or(ar1, ar2, out=ar1) + + # out is second arg + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, 
dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_or(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_or(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_or(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py b/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py
new file mode 100644
index 000000000000..cdd2da9ba863
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py
@@ -0,0 +1,166 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
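+
+# The in-place bitwise_or expectations above are gated on
+# dpnp.tensor._type_utils._can_cast: `x |= y` (and the out= variants) are
+# only accepted when y's dtype can be cast to x's dtype given the device's
+# fp16/fp64 support; otherwise ValueError is the required outcome.  Sketch
+# of the gate as used above (arguments: source dtype, destination dtype,
+# device fp16 aspect, device fp64 aspect):
+#
+#     if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+#         ar1 |= ar2            # performed in ar1's dtype
+#     else:
+#         ...                   # ValueError expected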
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_right_shift_dtype_matrix_contig(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]: + return + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op1_dtype) + dt2 = dpt.dtype(op2_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + x2 = dpt.arange(0, n, dtype=dt2) + + r = dpt.bitwise_right_shift(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + assert r.sycl_queue == x1.sycl_queue + assert r.sycl_queue == x2.sycl_queue + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype) + x2_np = np.arange(0, n, dtype=op2_dtype) + r_np = np.right_shift(x1_np, x2_np) + + assert r.dtype == r_np.dtype + assert (dpt.asnumpy(r) == r_np).all() + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_right_shift_dtype_matrix_strided(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]: + return + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op1_dtype) + dt2 = dpt.dtype(op2_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2] + x2 = dpt.arange(0, n, dtype=dt2)[::2] + + r = dpt.bitwise_right_shift(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + assert r.sycl_queue == x1.sycl_queue + assert r.sycl_queue == x2.sycl_queue + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2] + x2_np = np.arange(0, n, dtype=dt2)[::2] + r_np = np.right_shift(x1_np, x2_np) + + assert r.dtype == r_np.dtype + assert (dpt.asnumpy(r) == r_np).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_right_shift_range(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + x = dpt.ones(255, dtype=op_dtype) + y = dpt.asarray(64, dtype=op_dtype) + + z = dpt.bitwise_right_shift(x, y) + assert dpt.all(dpt.equal(z, 0)) + + +@pytest.mark.parametrize("dtype", _integral_dtypes) +def test_bitwise_right_shift_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + X >>= int(0) + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_right_shift_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 >>= ar2 + assert dpt.all(ar1 == 0) + + ar3 = dpt.ones(sz, 
dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 >>= ar4
+        assert dpt.all(ar3 == 0)
+    else:
+        with pytest.raises(ValueError):
+            ar1 >>= ar2
+            dpt.bitwise_right_shift(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_right_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_right_shift(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_right_shift(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_xor.py b/dpnp/tests/tensor/elementwise/test_bitwise_xor.py
new file mode 100644
index 000000000000..60bc2c518e26
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_xor.py
@@ -0,0 +1,158 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
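+
+# The in-place right-shift tests above only shift all-ones inputs; for
+# negative values of signed dtypes the expected semantics (matching
+# np.right_shift) are an arithmetic shift that preserves the sign.  A
+# minimal sketch, with "i4" chosen only for illustration:
+#
+#     x = dpt.asarray([-8, 8], dtype="i4")
+#     dpt.bitwise_right_shift(x, 1)   # expected: [-4, 4]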
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_xor_dtype_matrix_contig(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + + x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1) + + r = dpt.bitwise_xor(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype) + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype) + r_np = np.bitwise_xor(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_xor_dtype_matrix_strided(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2] + + x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2] + + r = dpt.bitwise_xor(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2] + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2] + r_np = np.bitwise_xor(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +def test_bitwise_xor_bool(): + get_queue_or_skip() + + x1 = dpt.asarray([True, False]) + x2 = dpt.asarray([False, True]) + + r_bw = dpt.bitwise_xor(x1[:, dpt.newaxis], x2[dpt.newaxis]) + r_lo = dpt.logical_xor(x1[:, dpt.newaxis], x2[dpt.newaxis]) + + assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_xor_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X ^= False + else: + X ^= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_xor_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 ^= ar2 + assert dpt.all(ar1 == 0) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 ^= ar4 + assert dpt.all(ar3 == 0) + else: + with pytest.raises(ValueError): + ar1 ^= ar2 + dpt.bitwise_xor(ar1, ar2, out=ar1) + + # out is second arg + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = 
dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_xor(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_xor(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_xor(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_cbrt.py b/dpnp/tests/tensor/elementwise/test_cbrt.py
new file mode 100644
index 000000000000..8c063d3fbdec
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_cbrt.py
@@ -0,0 +1,98 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
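+
+# The all-zero expectations in the in-place xor tests above follow from the
+# identity x ^ x == 0 applied to all-ones operands.  The same identity gives
+# a quick self-cancellation sketch (dtype chosen only for illustration):
+#
+#     x = dpt.arange(16, dtype="u1")
+#     assert dpt.all(dpt.bitwise_xor(x, x) == 0)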
+# ***************************************************************************** + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _map_to_device_dtype, + _no_complex_dtypes, + _real_fp_dtypes, +) + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_cbrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.cbrt(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.usefixtures("suppress_invalid_numpy_warnings") +def test_cbrt_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = dpt.cbrt(X) + expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + tol = dpt.finfo(dpt.float32).resolution + + assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True) diff --git a/dpnp/tests/tensor/elementwise/test_complex.py b/dpnp/tests/tensor/elementwise/test_complex.py new file mode 100644 index 000000000000..2a006a7c519a --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_complex.py @@ -0,0 +1,243 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_complex_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.real(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.real(X).dtype == expected_dtype + + expected_dtype = np.imag(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.imag(X).dtype == expected_dtype + + expected_dtype = np.conj(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.conj(X).dtype == expected_dtype + + +@pytest.mark.parametrize( + "np_call, dpt_call", + [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)], +) +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_complex_output(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + + x1 = np.linspace(0, 10, num=n_seq, dtype=dtype) + x2 = np.linspace(0, 20, num=n_seq, dtype=dtype) + Xnp = x1 + 1j * x2 + X = dpt.asarray(Xnp, sycl_queue=q) + + Y = dpt_call(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np_call(Xnp), atol=tol, rtol=tol) + + Z = dpt.empty_like(X, dtype=Y.dtype) + dpt_call(X, out=Z) + + assert_allclose(dpt.asnumpy(Z), np_call(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize( + "np_call, dpt_call", + [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)], +) +@pytest.mark.parametrize("usm_type", _usm_types) +def test_complex_usm_type(np_call, dpt_call, usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("c8") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = np.pi / 6 + 1j * np.pi / 3 + X[..., 1::2] = np.pi / 3 + 1j * np.pi / 6 + + Y = dpt_call(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + X_np = np.empty(input_shape, dtype=arg_dt) + X_np[..., 0::2] = np.complex64(np.pi / 6 + 1j * np.pi / 3) + X_np[..., 1::2] = np.complex64(np.pi / 3 + 1j * np.pi / 6) + + expected_Y = np_call(X_np) + + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize( + "np_call, dpt_call", + [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)], +) +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_complex_order(np_call, dpt_call, dtype): + q = get_queue_or_skip() + 
skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = np.pi / 6 + 1j * np.pi / 3 + X[..., 1::2] = np.pi / 3 + 1j * np.pi / 6 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np_call(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt_call(U, order=ord) + assert_allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_projection_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = [ + complex(1, 2), + complex(dpt.inf, -1), + complex(0, -dpt.inf), + complex(-dpt.inf, dpt.nan), + ] + Y = [ + complex(1, 2), + complex(np.inf, -0.0), + complex(np.inf, -0.0), + complex(np.inf, 0.0), + ] + + Xf = dpt.asarray(X, dtype=dtype, sycl_queue=q) + Yf = np.array(Y, dtype=dtype) + + tol = 8 * dpt.finfo(Xf.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.proj(Xf)), Yf, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_projection(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + Xf = dpt.asarray(1, dtype=dtype, sycl_queue=q) + out_dtype = dpt.proj(Xf).dtype + Yf = np.array(complex(1, 0), dtype=out_dtype) + + tol = 8 * dpt.finfo(Yf.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.proj(Xf)), Yf, atol=tol, rtol=tol) + + +@pytest.mark.parametrize( + "np_call, dpt_call", + [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)], +) +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_complex_strided(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + np.random.seed(42) + strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4]) + sizes = [2, 4, 6, 8, 9, 24, 72] + tol = 8 * dpt.finfo(dtype).resolution + + low = -1000.0 + high = 1000.0 + for ii in sizes: + x1 = np.random.uniform(low=low, high=high, size=ii) + x2 = np.random.uniform(low=low, high=high, size=ii) + Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype) + X = dpt.asarray(Xnp) + Ynp = np_call(Xnp) + for jj in strides: + assert_allclose( + dpt.asnumpy(dpt_call(X[::jj])), + Ynp[::jj], + atol=tol, + rtol=tol, + ) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_complex_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = [np.nan, -np.nan, np.inf, -np.inf, +0.0, -0.0] + xc = [complex(*val) for val in itertools.product(x, repeat=2)] + + Xc_np = np.array(xc, dtype=dtype) + Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q) + + tol = 8 * dpt.finfo(dtype).resolution + + actual = dpt.real(Xc) + expected = np.real(Xc_np) + assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol) + + actual = dpt.imag(Xc) + expected = np.imag(Xc_np) + assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol) + + actual = dpt.conj(Xc) + expected = np.conj(Xc_np) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_copysign.py b/dpnp/tests/tensor/elementwise/test_copysign.py new file mode 100644 index 000000000000..f9ec5345d257 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_copysign.py @@ -0,0 +1,130 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, + _real_fp_dtypes, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_copysign_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.copysign(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.copysign(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _real_fp_dtypes) +def test_copysign_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.copysign(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.copysign(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", _real_fp_dtypes) +def test_copysign(dt): + q = get_queue_or_skip() + 
skip_if_dtype_not_supported(dt, q) + + x = dpt.arange(100, dtype=dt, sycl_queue=q) + x[1::2] *= -1 + y = dpt.ones(100, dtype=dt, sycl_queue=q) + y[::2] *= -1 + res = dpt.copysign(x, y) + expected = dpt.negative(x) + tol = dpt.finfo(dt).resolution + assert dpt.allclose(res, expected, atol=tol, rtol=tol) + + +def test_copysign_special_values(): + get_queue_or_skip() + + x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4") + y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4") + res = dpt.copysign(x1, y1) + assert dpt.all(dpt.signbit(res)) + x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4") + res = dpt.copysign(x2, y1) + assert dpt.all(dpt.signbit(res)) + y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4") + res = dpt.copysign(x2, y2) + assert not dpt.any(dpt.signbit(res)) + res = dpt.copysign(x1, y2) + assert not dpt.any(dpt.signbit(res)) diff --git a/dpnp/tests/tensor/elementwise/test_divide.py b/dpnp/tests/tensor/elementwise/test_divide.py new file mode 100644 index 000000000000..99de5a51214d --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_divide.py @@ -0,0 +1,311 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
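+
+# test_copysign_special_values above checks signs via dpt.signbit rather than
+# comparison, because -0.0 == +0.0 and NaN compares unequal to everything,
+# so equality cannot observe the transferred sign.  A minimal sketch of the
+# distinction:
+#
+#     z = dpt.copysign(dpt.asarray(0.0, dtype="f4"),
+#                      dpt.asarray(-1.0, dtype="f4"))
+#     bool(dpt.equal(z, 0.0))   # True: -0.0 equals +0.0
+#     bool(dpt.signbit(z))      # True: the sign bit is nevertheless set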
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +from dpnp.tensor._tensor_elementwise_impl import _divide_by_scalar +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _complex_fp_dtypes, + _real_fp_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_divide_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.divide(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_divide_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_divide_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.divide(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.divide(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_divide_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.divide(m, v) + + 
expected = np.divide(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.divide(v, m)
+    expected2 = np.divide(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_divide_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.divide(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.divide(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_divide_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.divide(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_divide_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.divide(a, c)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes + _complex_fp_dtypes)
+def test_divide_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "f":
+        X /= float(1)
+    elif dt_kind == "c":
+        X /= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # in-place divide is only valid when ar1 is inexact and ar2 casts to it
+    if (
+        _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind")
+        and dpt.dtype(op1_dtype).kind in "fc"
+    ):
+        ar1 /= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 /= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(ValueError):
+            ar1 /= ar2
+            dpt.divide(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if (
+        _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64)
+        and dpt.dtype(op2_dtype).kind in "fc"
+    ):
+        dpt.divide(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.divide(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(ValueError):
+            dpt.divide(ar1, ar2, out=ar2)
+
+
+def test_divide_gh_1711():
+    "See https://github.com/IntelPython/dpctl/issues/1711"
+    get_queue_or_skip()
+
+    res = dpt.divide(-4, dpt.asarray(1, dtype="u4"))
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.dtype.kind == "f"
+    assert dpt.allclose(res, -4 / dpt.asarray(1, dtype="i4"))
+
+    res = dpt.divide(dpt.asarray(3, dtype="u4"), -2)
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.dtype.kind == "f"
+    assert dpt.allclose(res, dpt.asarray(3, dtype="i4") / -2)
+
+
+# don't test for overflowing double as Python won't cast
+# a Python integer of that size to a Python float
+@pytest.mark.parametrize("fp_dt", [dpt.float16, dpt.float32])
+def test_divide_by_scalar_overflow(fp_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(fp_dt, q)
+
+    x = dpt.ones(10, dtype=fp_dt, sycl_queue=q)
+    out = dpt.empty_like(x)
+
+    max_exp = np.finfo(fp_dt).maxexp
+    sca = 2**max_exp
+
+    _manager = SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    _, ev = _divide_by_scalar(
+        src=x, scalar=sca, dst=out, sycl_queue=q, depends=dep_evs
+    )
+    ev.wait()
+
+    assert dpt.all(out == 0)
diff --git a/dpnp/tests/tensor/elementwise/test_elementwise_classes.py b/dpnp/tests/tensor/elementwise/test_elementwise_classes.py
new file mode 100644
index 000000000000..04b92937f371
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_elementwise_classes.py
@@ -0,0 +1,150 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import pytest + +import dpnp.tensor as dpt + +from ..helper import get_queue_or_skip + +unary_fn = dpt.negative +binary_fn = dpt.divide + + +def test_unary_class_getters(): + fn = unary_fn.get_implementation_function() + assert callable(fn) + + fn = unary_fn.get_type_result_resolver_function() + assert callable(fn) + + +def test_unary_class_types_property(): + get_queue_or_skip() + loop_types = unary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_unary_class_str_repr(): + s = str(unary_fn) + r = repr(unary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = unary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_unary_read_only_out(): + get_queue_or_skip() + x = dpt.arange(32, dtype=dpt.int32) + r = dpt.empty_like(x) + r.flags["W"] = False + with pytest.raises(ValueError): + unary_fn(x, out=r) + + +def test_binary_class_getters(): + fn = binary_fn.get_implementation_function() + assert callable(fn) + + fn = binary_fn.get_implementation_inplace_function() + assert callable(fn) + + fn = binary_fn.get_type_result_resolver_function() + assert callable(fn) + + fn = binary_fn.get_type_promotion_path_acceptance_function() + assert callable(fn) + + +def test_binary_class_types_property(): + get_queue_or_skip() + loop_types = binary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_binary_class_str_repr(): + s = str(binary_fn) + r = repr(binary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = binary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_unary_class_nin(): + nin = unary_fn.nin + assert isinstance(nin, int) + assert nin == 1 + + +def test_binary_class_nin(): + nin = binary_fn.nin + assert isinstance(nin, int) + assert nin == 2 + + +def test_unary_class_nout(): + nout = unary_fn.nout + assert isinstance(nout, int) + assert nout == 1 + + +def test_binary_class_nout(): + nout = binary_fn.nout + assert isinstance(nout, int) + assert nout == 1 + + +def test_binary_read_only_out(): + get_queue_or_skip() + x1 = dpt.ones(32, dtype=dpt.float32) + x2 = dpt.ones_like(x1) + r = dpt.empty_like(x1) + r.flags["W"] = False + with pytest.raises(ValueError): + binary_fn(x1, x2, out=r) + + +def test_binary_no_inplace_op(): + get_queue_or_skip() + x1 = dpt.ones(10, dtype="i4") + x2 = dpt.ones_like(x1) + + with pytest.raises(ValueError): + dpt.logaddexp._inplace_op(x1, x2) diff --git a/dpnp/tests/tensor/elementwise/test_equal.py b/dpnp/tests/tensor/elementwise/test_equal.py new file mode 100644 index 000000000000..f5e0cd520762 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_equal.py @@ -0,0 +1,207 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.equal( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.equal( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.equal(ar1, ar2, order="A") + 
assert r3.flags.c_contiguous + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.equal(m, v) + expected = np.full((100, 5), [False, True, False, False, False], dtype="?") + + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.equal(v, m) + assert (dpt.asnumpy(r2) == expected).all() + + r3 = dpt.empty_like(m, dtype="?") + dpt.equal(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + assert dpt.all(R) + R = dpt.equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + assert dpt.all(R) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.equal(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_exp.py b/dpnp/tests/tensor/elementwise/test_exp.py new file mode 100644 index 000000000000..ca204128317e --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_exp.py @@ -0,0 +1,254 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.exp(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.exp(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_exp_real_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + n_rep = 137 + Xnp = np.linspace(0.01, 88.1, num=n_seq, dtype=dtype) + X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q) + Y = dpt.exp(X) + with np.errstate(all="ignore"): + Ynp = np.exp(Xnp) + + tol = 8 * dpt.finfo(dtype).resolution + assert_allclose(dpt.asnumpy(Y), np.repeat(Ynp, n_rep), atol=tol, rtol=tol) + + Z = dpt.empty_like(X, dtype=dtype) + dpt.exp(X, out=Z) + + assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol) + + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_exp_complex_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + n_rep = 137 + low = -88.0 + high = 88.0 + x1 = np.random.uniform(low=low, high=high, size=n_seq) + x2 = np.random.uniform(low=low, high=high, size=n_seq) + Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype) + + X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q) + Y = dpt.exp(X) + + tol = 8 * dpt.finfo(dtype).resolution + assert_allclose( + dpt.asnumpy(Y), np.repeat(np.exp(Xnp), n_rep), atol=tol, rtol=tol + ) + + Z = dpt.empty_like(X, dtype=dtype) + dpt.exp(X, out=Z) + + assert_allclose( + dpt.asnumpy(Z), np.repeat(np.exp(Xnp), n_rep), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_exp_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 16.0 + X[..., 1::2] = 23.0 + + Y = dpt.exp(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.exp(np.float32(16.0)) + 
expected_Y[..., 1::2] = np.exp(np.float32(23.0))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 8.0
+    X[..., 1::2] = 11.0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.exp(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.exp(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_analytical_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    log2_ = 0.69314718055994530943
+    Xnp = np.array(x, dtype=dtype) * log2_
+    X = dpt.asarray(Xnp, dtype=dtype)
+    assert_allclose(dpt.asnumpy(dpt.exp(X)), np.exp(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_real_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [np.nan, np.inf, -np.inf, 0.0, -0.0]
+    Xnp = np.array(x, dtype=dtype)
+    X = dpt.asarray(x, dtype=dtype)
+
+    Y = dpt.asnumpy(dpt.exp(X))
+    Ynp = np.exp(Xnp)
+    assert_allclose(Y, Ynp, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_real_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=0.01, high=88.1, size=ii)
+        Xnp = Xnp.astype(dtype)  # astype returns a copy; rebind to test dtype
+        X = dpt.asarray(Xnp)
+        Ynp = np.exp(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.exp(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_exp_complex_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -88.0
+    high = 88.0
+    for ii in sizes:
+        x1 = np.random.uniform(low=low, high=high, size=ii)
+        x2 = np.random.uniform(low=low, high=high, size=ii)
+        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.exp(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.exp(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_exp_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, +0.0, -0.0, +1.0, -1.0]
+    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
+
+    Xc_np = np.array(xc, dtype=dtype)
+    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
+
+    with np.errstate(all="ignore"):
+        Ynp = np.exp(Xc_np)
+    Y = dpt.exp(Xc)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.real(Y)), np.real(Ynp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(dpt.imag(Y)), np.imag(Ynp), atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_exp2.py b/dpnp/tests/tensor/elementwise/test_exp2.py
new file mode 100644
index 000000000000..ae2ab43c39be
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_exp2.py
@@ -0,0 +1,187 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.exp2(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_exp2_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + Y = dpt.exp2(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4)) + expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.exp2(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.exp2(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_exp2_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = np.asarray([np.nan, 1.0, 1.0, np.inf, 0.0], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) + + # special cases for complex variant + num_finite = 1.0 + vals = [ + complex(0.0, 0.0), + complex(num_finite, dpt.inf), + complex(num_finite, dpt.nan), + complex(dpt.inf, 0.0), + complex(-dpt.inf, num_finite), + complex(dpt.inf, num_finite), + complex(-dpt.inf, dpt.inf), + complex(dpt.inf, dpt.inf), 
+ complex(-dpt.inf, dpt.nan), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 0.0), + complex(dpt.nan, num_finite), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + cis_1 = complex(np.cos(num_finite), np.sin(num_finite)) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(1.0, 0.0), + c_nan, + c_nan, + complex(np.inf, 0.0), + 0.0, + np.inf * cis_1, + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(np.nan, 0.0), + c_nan, + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_expm1.py b/dpnp/tests/tensor/elementwise/test_expm1.py new file mode 100644 index 000000000000..bb665c424564 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_expm1.py @@ -0,0 +1,187 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_expm1_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.expm1(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.expm1(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_expm1_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(-2, 2, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.expm1(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.expm1(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_expm1_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(-2, 2, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.expm1(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.expm1(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_expm1_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 / 50 + X[..., 1::2] = 1 / 25 + + Y = dpt.expm1(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.expm1(np.float32(1 / 50)) + expected_Y[..., 1::2] = np.expm1(np.float32(1 / 25)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_expm1_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 / 50 + X[..., 1::2] = 1 / 25 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.expm1(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.expm1(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_expm1_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = np.asarray([np.nan, 0.0, -0.0, np.inf, -1.0], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.expm1(X)), res, atol=tol, rtol=tol) + + # special cases for complex variant + num_finite = 1.0 + vals = [ + complex(0.0, 0.0), + complex(num_finite, dpt.inf), + complex(num_finite, dpt.nan), + complex(dpt.inf, 0.0), + complex(-dpt.inf, num_finite), + complex(dpt.inf, num_finite), + complex(-dpt.inf, dpt.inf), + 
complex(dpt.inf, dpt.inf), + complex(-dpt.inf, dpt.nan), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 0.0), + complex(dpt.nan, num_finite), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + cis_1 = complex(np.cos(num_finite), np.sin(num_finite)) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(0.0, 0.0), + c_nan, + c_nan, + complex(np.inf, 0.0), + 0.0 * cis_1 - 1.0, + np.inf * cis_1 - 1.0, + complex(-1.0, 0.0), + complex(np.inf, np.nan), + complex(-1.0, 0.0), + complex(np.inf, np.nan), + complex(np.nan, 0.0), + c_nan, + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.expm1(X)), res, atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py b/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py new file mode 100644 index 000000000000..f9af864b29fe --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py @@ -0,0 +1,182 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools +import re + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _map_to_device_dtype, + _no_complex_dtypes, + _real_value_dtypes, +) + +_all_funcs = [(np.floor, dpt.floor), (np.ceil, dpt.ceil), (np.trunc, dpt.trunc)] + + +@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc]) +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_floor_ceil_trunc_out_type(dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0.1, dtype=arg_dt, sycl_queue=q) + expected_dtype = _map_to_device_dtype(arg_dt, q.sycl_device) + assert dpt_call(X).dtype == expected_dtype + + X = dpt.asarray(0.1, dtype=dtype, sycl_queue=q) + expected_dtype = _map_to_device_dtype(arg_dt, q.sycl_device) + Y = dpt.empty_like(X, dtype=expected_dtype) + dpt_call(X, out=Y) + assert_allclose(dpt.asnumpy(dpt_call(X)), dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_floor_ceil_trunc_usm_type(np_call, dpt_call, usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = -0.4 + X[..., 1::2] = 0.7 + + Y = dpt_call(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np_call(dpt.asnumpy(X)) + tol = 8 * dpt.finfo(Y.dtype).resolution + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_floor_ceil_trunc_order(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (4, 4, 4, 4) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = -0.4 + X[..., 1::2] = 0.7 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np_call(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt_call(U, order=ord) + assert_allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc]) +@pytest.mark.parametrize("dtype", _real_value_dtypes) +def test_floor_ceil_trunc_error_dtype(dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.zeros(5, dtype=dtype) + y = dpt.empty_like(x, dtype="b1") + with pytest.raises(ValueError) as excinfo: + dpt_call(x, out=y) + assert re.match("Output array of type.*is needed", str(excinfo.value)) + + +@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_floor_ceil_trunc_contig(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + n_rep = 137 + Xnp = np.linspace(-99.9, 99.9, num=n_seq, dtype=dtype) + + X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q) + Y = dpt_call(X) + + assert_allclose(dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep)) + + Z = dpt.empty_like(X, dtype=dtype) + dpt_call(X, out=Z) + + assert_allclose(dpt.asnumpy(Z), 
np.repeat(np_call(Xnp), n_rep))
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_floor_ceil_trunc_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 24, 32, 72]
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=-99.9, high=99.9, size=ii)
+        Xnp = Xnp.astype(dtype)  # astype returns a copy; rebind to test dtype
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_floor_ceil_trunc_special_cases(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, +0.0, -0.0]
+
+    xf = np.array(x, dtype=dtype)
+    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
+
+    Y_np = np_call(xf)
+    Y = dpt.asnumpy(dpt_call(yf))
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(Y, Y_np, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Y_np))
diff --git a/dpnp/tests/tensor/elementwise/test_floor_divide.py b/dpnp/tests/tensor/elementwise/test_floor_divide.py
new file mode 100644
index 000000000000..5762b09afdb3
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_floor_divide.py
@@ -0,0 +1,317 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _integral_dtypes, + _no_complex_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.floor_divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.floor_divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.floor_divide(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.floor_divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_floor_divide_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.floor_divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_floor_divide_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.floor_divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.floor_divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.floor_divide(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.floor_divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.floor_divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.floor_divide(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_floor_divide_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.floor_divide(m, v) 
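+    # the reference result below is computed with NumPy on host data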
+
+    expected = np.floor_divide(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.floor_divide(v, m)
+    expected2 = np.floor_divide(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
+def test_floor_divide_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.floor_divide(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.floor_divide(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_floor_divide_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.floor_divide(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_floor_divide_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.floor_divide(a, c)
+
+
+def test_floor_divide_gh_1247():
+    get_queue_or_skip()
+
+    x = dpt.ones(1, dtype="i4")
+    res = dpt.floor_divide(x, -2)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
+    )
+
+    x = dpt.full(1, -1, dtype="i4")
+    res = dpt.floor_divide(x, 2)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
+    )
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_floor_divide_integer_zero(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.arange(10, dtype=dtype, sycl_queue=q)
+    y = dpt.zeros_like(x, sycl_queue=q)
+    res = dpt.floor_divide(x, y)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.zeros(x.shape, dtype=res.dtype)
+    )
+
+
+def test_floor_divide_special_cases():
+    q = get_queue_or_skip()
+
+    x = dpt.empty(1, dtype="f4", sycl_queue=q)
+    y = dpt.empty_like(x)
+    x[0], y[0] = dpt.inf, dpt.inf
+    res = dpt.floor_divide(x, y)
+    with np.errstate(all="ignore"):
+        res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    x[0], y[0] = 0.0, -1.0
+    res = dpt.floor_divide(x, y)
+    x_np = dpt.asnumpy(x)
+    y_np = dpt.asnumpy(y)
+    res_np = np.floor_divide(x_np, y_np)
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    res = dpt.floor_divide(y, x)
+    with np.errstate(all="ignore"):
+        res_np = np.floor_divide(y_np, x_np)
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    x[0], y[0] = -1.0, dpt.inf
+    res = dpt.floor_divide(x, y)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.asarray([-0.0], dtype="f4")
+    )
+
+    res = dpt.floor_divide(y, x)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.asarray([-dpt.inf], dtype="f4")
+    )
+
+    x[0], y[0] = 1.0, dpt.nan
+    res = dpt.floor_divide(x, y)
+    res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_floor_divide_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X //= int(1)
+    elif dt_kind == "f":
+        X //= float(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # in-place floor_divide requires ar2's dtype to cast to ar1's dtype
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 //= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 //= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(ValueError):
+            ar1 //= ar2
+            dpt.floor_divide(ar1, ar2, out=ar1)
diff --git a/dpnp/tests/tensor/elementwise/test_greater.py b/dpnp/tests/tensor/elementwise/test_greater.py
new file mode 100644
index 000000000000..eb5f2b3929df
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_greater.py
@@ -0,0 +1,314 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_greater_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.greater(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.greater( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.greater(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.greater( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_greater_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.greater(ar1, ar2) + expected = np.greater(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.greater(ar1[::-2], ar2[::2]) + expected1 = np.greater(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype) + ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.greater(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.greater(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.greater(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.greater(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_greater_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8") + ar2 = dpt.full((4,), 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.greater(ar1, ar2) + expected = np.greater(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.greater(ar2, ar1) + expected1 = np.greater(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [dpt.nan, dpt.inf, -dpt.inf]: + + ar3 = dpt.full((4,), 
tp) + ar3_np = dpt.asnumpy(ar3) + + r2 = dpt.greater(ar1, ar3) + expected2 = np.greater(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.greater(ar3, ar1) + expected3 = np.greater(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_greater_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.greater(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_greater_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.greater(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.greater(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.greater(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.greater(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.greater(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.greater(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.greater(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.greater(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.greater(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.greater(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_greater_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.greater(m, v) + + expected = np.greater( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.greater(v, m) + expected2 = np.greater( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_greater_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.greater(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.greater(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_greater_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.greater(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_greater_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with 
pytest.raises(ValueError): + dpt.greater(a, c) + + +def test_greater_mixed_integer_kinds(): + get_queue_or_skip() + + x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8")) + x2 = dpt.arange(10, dtype="u8") + + # u8 - i8 + res = dpt.greater(x2, x1) + assert dpt.all(res[1:]) + assert not res[0] + # i8 - u8 + assert not dpt.any(dpt.greater(x1, x2)) + + # Python scalar + assert dpt.all(dpt.greater(x2, -1)) + assert not dpt.any(dpt.greater(-1, x2)) + + +def test_greater_very_large_py_int(): + get_queue_or_skip() + + py_int = dpt.iinfo(dpt.int64).max + 10 + + x = dpt.asarray(3, dtype="u8") + assert py_int > x + assert not dpt.greater(x, py_int) + + x = dpt.asarray(py_int, dtype="u8") + assert x > -1 + assert not dpt.greater(-1, x) diff --git a/dpnp/tests/tensor/elementwise/test_greater_equal.py b/dpnp/tests/tensor/elementwise/test_greater_equal.py new file mode 100644 index 000000000000..f2e97bf62189 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_greater_equal.py @@ -0,0 +1,313 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_greater_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.greater_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.greater_equal( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.greater_equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.greater_equal( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_greater_equal_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.greater_equal(ar1, ar2) + expected = np.greater_equal(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.greater_equal(ar1[::-2], ar2[::2]) + expected1 = np.greater_equal(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype) + ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + r2 = dpt.greater_equal(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.greater_equal(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.greater_equal(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.greater_equal(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_greater_equal_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8") + ar2 = dpt.full((4,), 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.greater_equal(ar1, ar2) + expected = np.greater_equal(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.greater_equal(ar2, ar1) + expected1 = np.greater_equal(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == 
expected1).all() + with np.errstate(invalid="ignore"): + for tp in [dpt.nan, dpt.inf, -dpt.inf]: + + ar3 = dpt.full((4,), tp) + ar3_np = dpt.asnumpy(ar3) + r2 = dpt.greater_equal(ar1, ar3) + expected2 = np.greater_equal(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.greater_equal(ar3, ar1) + expected3 = np.greater_equal(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_greater_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.greater_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_greater_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.greater_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.greater_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.greater_equal(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.greater_equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.greater_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.greater_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.greater_equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.greater_equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.greater_equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.greater_equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_greater_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.greater_equal(m, v) + + expected = np.greater_equal( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.greater_equal(v, m) + expected2 = np.greater_equal( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_greater_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.greater_equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.greater_equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_greater_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = 
dpt.greater_equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_greater_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.greater_equal(a, c) + + +def test_greater_equal_mixed_integer_kinds(): + get_queue_or_skip() + + x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8")) + x2 = dpt.arange(10, dtype="u8") + + # u8 - i8 + res = dpt.greater_equal(x2, x1) + assert dpt.all(res) + # i8 - u8 + res = dpt.greater_equal(x1, x2) + assert not dpt.any(res[1:]) + assert res[0] + + # Python scalar + assert dpt.all(dpt.greater_equal(x2, -1)) + assert not dpt.any(dpt.greater_equal(-1, x2)) + + +def test_greater_equal_very_large_py_int(): + get_queue_or_skip() + + py_int = dpt.iinfo(dpt.int64).max + 10 + + x = dpt.asarray(3, dtype="u8") + assert py_int >= x + assert not dpt.greater_equal(x, py_int) + + x = dpt.asarray(py_int, dtype="u8") + assert x >= -1 + assert not dpt.greater_equal(-1, x) diff --git a/dpnp/tests/tensor/elementwise/test_hyperbolic.py b/dpnp/tests/tensor/elementwise/test_hyperbolic.py new file mode 100644 index 000000000000..b94c5ede3f2a --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_hyperbolic.py @@ -0,0 +1,202 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+_hyper_funcs = [(np.sinh, dpt.sinh), (np.cosh, dpt.cosh), (np.tanh, dpt.tanh)]
+_inv_hyper_funcs = [
+    (np.arcsinh, dpt.asinh),
+    (np.arccosh, dpt.acosh),
+    (np.arctanh, dpt.atanh),
+]
+_all_funcs = _hyper_funcs + _inv_hyper_funcs
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_hyper_out_type(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    a = 1 if np_call == np.arccosh else 0
+
+    x = dpt.asarray(a, dtype=dtype, sycl_queue=q)
+    expected_dtype = np_call(np.array(a, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt_call(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_hyper_real_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    # keep the inputs inside each function's real domain
+    if np_call == np.arctanh:
+        Xnp = np.linspace(-0.9, 0.9, num=n_seq, dtype=dtype)
+    elif np_call == np.arccosh:
+        Xnp = np.linspace(1.01, 10.0, num=n_seq, dtype=dtype)
+    else:
+        Xnp = np.linspace(-10.0, 10.0, num=n_seq, dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_hyper_complex_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    low = -9.0
+    high = 9.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = x1 + 1j * x2
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    expected = np.repeat(np_call(Xnp), n_rep).astype(dtype)
+    tol = 50 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), expected, atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_hyper_real_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -10.0
+    high = 10.0
+    if np_call == np.arctanh:
+        low = -0.9
+        high = 0.9
+    elif np_call == np.arccosh:
+        low = 1.01
+        high = 100.0
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=low, high=high, size=ii)
+        # bind the cast result; a bare Xnp.astype(dtype) is a no-op that
+        # would leave the parametrized dtype untested
+        Xnp = Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
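+# The strided tests above and below slice the input with step ``jj`` to get
+# non-contiguous (possibly negative-stride) views. A minimal sketch of the
+# pattern in plain NumPy, purely illustrative (nothing here is executed):
+#
+#     a = np.arange(8, dtype="f4")   # contiguous, strides == (4,)
+#     v = a[::-2]                    # view [7., 5., 3., 1.], strides == (-8,)
+#
+# ``dpt.asarray(Xnp)[::jj]`` produces the analogous usm_ndarray view on the
+# device, whose values must match the NumPy reference within ``tol``.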
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_hyper_complex_strided(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + np.random.seed(42) + strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4]) + sizes = [2, 4, 6, 8, 9, 24, 72] + tol = 50 * dpt.finfo(dtype).resolution + + low = -8.0 + high = 8.0 + for ii in sizes: + x1 = np.random.uniform(low=low, high=high, size=ii) + x2 = np.random.uniform(low=low, high=high, size=ii) + Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype) + X = dpt.asarray(Xnp) + Ynp = np_call(Xnp) + for jj in strides: + assert_allclose( + dpt.asnumpy(dpt_call(X[::jj])), + Ynp[::jj], + atol=tol, + rtol=tol, + ) + + +@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_hyper_real_special_cases(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = [np.nan, np.inf, -np.inf, 2.0, -2.0, +0.0, -0.0, +1.0, -1.0] + + xf = np.array(x, dtype=dtype) + yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q) + + with np.errstate(all="ignore"): + Y_np = np_call(xf) + + tol = 8 * dpt.finfo(dtype).resolution + assert_allclose(dpt.asnumpy(dpt_call(yf)), Y_np, atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_hypot.py b/dpnp/tests/tensor/elementwise/test_hypot.py new file mode 100644 index 000000000000..bc87736318ee --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_hypot.py @@ -0,0 +1,210 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_hypot_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.zeros_like(ar1, dtype=op2_dtype, sycl_queue=q) + + r = dpt.hypot(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.hypot( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q) + ar4 = dpt.zeros(2 * sz, dtype=op2_dtype, sycl_queue=q) + + r = dpt.hypot(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.hypot( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_hypot_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.hypot(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_hypot_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.hypot(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.hypot(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.hypot(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.hypot(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.hypot(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.hypot(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_hypot_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.hypot(m, v) + + expected = np.hypot( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + tol = 8 * np.finfo(r.dtype).resolution + assert 
np.allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) + + r2 = dpt.hypot(v, m) + expected2 = np.hypot( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert np.allclose( + dpt.asnumpy(r2), expected2.astype(r2.dtype), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) +def test_hypot_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.hypot(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.hypot(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_hypot_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.hypot(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_hypot_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.hypot(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_isfinite.py b/dpnp/tests/tensor/elementwise/test_isfinite.py new file mode 100644 index 000000000000..f3a6664e6916 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_isfinite.py @@ -0,0 +1,114 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _all_dtypes + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isfinite_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + assert dpt.isfinite(X).dtype == dpt.bool + + +def test_isfinite_output(): + q = get_queue_or_skip() + + Xnp = np.asarray(np.nan) + X = dpt.asarray(np.nan, sycl_queue=q) + assert dpt.asnumpy(dpt.isfinite(X)) == np.isfinite(Xnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_isfinite_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = complex(np.nan, np.nan) + y2 = complex(1, np.nan) + y3 = complex(np.nan, 1) + y4 = complex(2, 1) + y5 = complex(np.inf, 1) + + Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 12) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isfinite(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp)) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_isfinite_floats(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = np.nan + y2 = 1 + y3 = np.inf + + for mult in [123, 137, 255, 271, 272]: + Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isfinite(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isfinite_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q) + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms) + expected_Y = np.full(U.shape, fill_value=True, dtype=dpt.bool) + for ord in ["C", "F", "A", "K"]: + Y = dpt.isfinite(U, order=ord) + assert_allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpnp/tests/tensor/elementwise/test_isinf.py b/dpnp/tests/tensor/elementwise/test_isinf.py new file mode 100644 index 000000000000..91b2e9420446 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_isinf.py @@ -0,0 +1,108 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isinf_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    assert dpt.isinf(X).dtype == dpt.bool
+
+
+def test_isinf_output():
+    q = get_queue_or_skip()
+
+    Xnp = np.asarray(np.inf)
+    X = dpt.asarray(np.inf, sycl_queue=q)
+    assert dpt.asnumpy(dpt.isinf(X)) == np.isinf(Xnp)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_isinf_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = complex(np.inf, np.inf)
+    y2 = complex(1, np.inf)
+    y3 = complex(np.inf, 1)
+    y4 = complex(2, 1)
+    y5 = complex(np.nan, np.inf)
+    y6 = complex(np.inf, np.nan)
+
+    Ynp = np.repeat(np.array([y1, y2, y3, y4, y5, y6], dtype=dtype), 123)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_isinf_floats(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = np.nan
+    y2 = 1
+    y3 = np.inf
+    y4 = -np.inf
+
+    for mult in [123, 137, 255, 271, 272]:
+        Ynp = np.repeat(np.array([y1, y2, y3, y4], dtype=dtype), mult)
+        Y = dpt.asarray(Ynp, sycl_queue=q)
+        assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isinf_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q)
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms)
+        expected_Y = np.full(U.shape, fill_value=False, dtype=dpt.bool)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.isinf(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpnp/tests/tensor/elementwise/test_isnan.py b/dpnp/tests/tensor/elementwise/test_isnan.py
new file mode 100644
index 000000000000..fe6f2660734a
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_isnan.py
@@ -0,0 +1,113 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _all_dtypes + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isnan_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + assert dpt.isnan(X).dtype == dpt.bool + + +def test_isnan_output(): + q = get_queue_or_skip() + + Xnp = np.asarray(np.nan) + X = dpt.asarray(np.nan, sycl_queue=q) + assert dpt.asnumpy(dpt.isnan(X)) == np.isnan(Xnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_isnan_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = complex(np.nan, np.nan) + y2 = complex(1, np.nan) + y3 = complex(np.nan, 1) + y4 = complex(2, 1) + y5 = complex(np.inf, 1) + + Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 123) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isnan(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp)) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_isnan_floats(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = np.nan + y2 = 1 + y3 = np.inf + + for mult in [123, 137, 255, 271, 272]: + Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isnan(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isnan_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q) 
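+    # Permute the axes of a sliced, flipped view so that each iteration
+    # below exercises a different memory layout with every order flag.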
+ + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms) + expected_Y = np.full(U.shape, fill_value=False, dtype=dpt.bool) + for ord in ["C", "F", "A", "K"]: + Y = dpt.isnan(U, order=ord) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpnp/tests/tensor/elementwise/test_less.py b/dpnp/tests/tensor/elementwise/test_less.py new file mode 100644 index 000000000000..0abf1e440643 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_less.py @@ -0,0 +1,314 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_less_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.less(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.less( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.less(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.less( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_less_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.less(ar1, ar2) + expected = np.less(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.less(ar1[::-2], ar2[::2]) + expected1 = np.less(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype) + ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.less(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.less(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.less(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.less(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_less_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8") + ar2 = dpt.full((4,), 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.less(ar1, ar2) + expected = np.less(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.less(ar2, ar1) + expected1 = np.less(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [dpt.nan, dpt.inf, -dpt.inf]: + + ar3 = dpt.full((4,), tp) + ar3_np = dpt.asnumpy(ar3) + + r2 = dpt.less(ar1, ar3) 
+ expected2 = np.less(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.less(ar3, ar1) + expected3 = np.less(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_less_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.less(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_less_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.less(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.less(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.less(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.less(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.less(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.less(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.less(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.less(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.less(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.less(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_less_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.less(m, v) + + expected = np.less( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.less(v, m) + expected2 = np.less( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_less_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.less(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.less(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_less_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.less(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_less_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.less(a, c) + + +def test_less_mixed_integer_kinds(): + get_queue_or_skip() + + x1 = dpt.flip(dpt.arange(-9, 1, 
dtype="i8")) + x2 = dpt.arange(10, dtype="u8") + + # u8 - i8 + assert not dpt.any(dpt.less(x2, x1)) + # i8 - u8 + res = dpt.less(x1, x2) + assert not res[0] + assert dpt.all(res[1:]) + + # Python scalar + assert not dpt.any(dpt.less(x2, -1)) + assert dpt.all(dpt.less(-1, x2)) + + +def test_less_very_large_py_int(): + get_queue_or_skip() + + py_int = dpt.iinfo(dpt.int64).max + 10 + + x = dpt.asarray(3, dtype="u8") + assert not py_int < x + assert dpt.less(x, py_int) + + x = dpt.asarray(py_int, dtype="u8") + assert not x < -1 + assert dpt.less(-1, x) diff --git a/dpnp/tests/tensor/elementwise/test_less_equal.py b/dpnp/tests/tensor/elementwise/test_less_equal.py new file mode 100644 index 000000000000..1a5744475210 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_less_equal.py @@ -0,0 +1,313 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_less_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.less_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.less_equal( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.less_equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.less_equal( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_less_equal_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.less_equal(ar1, ar2) + expected = np.less_equal(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.less_equal(ar1[::-2], ar2[::2]) + expected1 = np.less_equal(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype) + ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.less_equal(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.less_equal(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.less_equal(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.less_equal(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_less_equal_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8") + ar2 = dpt.full((4,), 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.less_equal(ar1, ar2) + expected = np.less_equal(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.less_equal(ar2, ar1) + expected1 = np.less_equal(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in 
[dpt.nan, dpt.inf, -dpt.inf]: + + ar3 = dpt.full((4,), tp) + ar3_np = dpt.asnumpy(ar3) + r2 = dpt.less_equal(ar1, ar3) + expected2 = np.less_equal(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.less_equal(ar3, ar1) + expected3 = np.less_equal(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_less_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.less_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_less_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.less_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.less_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.less_equal(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.less_equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.less_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.less_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.less_equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.less_equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.less_equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.less_equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_less_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.less_equal(m, v) + + expected = np.less_equal( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.less_equal(v, m) + expected2 = np.less_equal( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_less_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.less_equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.less_equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_less_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.less_equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_less_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class 
Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.less_equal(a, c) + + +def test_less_equal_mixed_integer_kinds(): + get_queue_or_skip() + + x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8")) + x2 = dpt.arange(10, dtype="u8") + + # u8 - i8 + res = dpt.less_equal(x2, x1) + assert res[0] + assert not dpt.any(res[1:]) + # i8 - u8 + assert dpt.all(dpt.less_equal(x1, x2)) + + # Python scalar + assert not dpt.any(dpt.less_equal(x2, -1)) + assert dpt.all(dpt.less_equal(-1, x2)) + + +def test_less_equal_very_large_py_int(): + get_queue_or_skip() + + py_int = dpt.iinfo(dpt.int64).max + 10 + + x = dpt.asarray(3, dtype="u8") + assert not py_int <= x + assert dpt.less_equal(x, py_int) + + x = dpt.asarray(py_int, dtype="u8") + assert not x <= -1 + assert dpt.less_equal(-1, x) diff --git a/dpnp/tests/tensor/elementwise/test_log.py b/dpnp/tests/tensor/elementwise/test_log.py new file mode 100644 index 000000000000..b41fa85df05e --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_log.py @@ -0,0 +1,149 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.log(np.array(1, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.log(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.log(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.log(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_log_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + Y = dpt.log(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.log(np.float32(4 * dpt.e)) + expected_Y[..., 1::2] = np.log(np.float32(10 * dpt.e)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.log(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.log(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_log_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -dpt.inf, -1.0, -0.0, 0.0, dpt.inf], dtype="f4", sycl_queue=q + ) + Y = dpt.log(X) + + expected = np.array( + [np.nan, np.nan, np.nan, -np.inf, -np.inf, np.inf], dtype="f4" + ) + + assert_equal(dpt.asnumpy(Y), expected) diff --git a/dpnp/tests/tensor/elementwise/test_log10.py b/dpnp/tests/tensor/elementwise/test_log10.py new file mode 100644 index 000000000000..02c652293b9d --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_log10.py @@ -0,0 +1,152 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.log10(np.array(1, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.log10(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.log10(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose( + dpt.asnumpy(Y), np.log10(Xnp), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.log10(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose( + dpt.asnumpy(Y), np.log10(Xnp), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_log_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e 
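+    # X holds two known positive constants (4e and 10e), so the expected
+    # log10 values can be formed slice-by-slice below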
+ + Y = dpt.log10(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.log10(np.float32(4 * dpt.e)) + expected_Y[..., 1::2] = np.log10(np.float32(10 * dpt.e)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.log10(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.log10(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) + + +def test_log_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q + ) + Xnp = dpt.asnumpy(X) + + with np.errstate(invalid="ignore", divide="ignore"): + assert_equal(dpt.asnumpy(dpt.log10(X)), np.log10(Xnp)) diff --git a/dpnp/tests/tensor/elementwise/test_log1p.py b/dpnp/tests/tensor/elementwise/test_log1p.py new file mode 100644 index 000000000000..eb6205650e10 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_log1p.py @@ -0,0 +1,188 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log1p_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.log1p(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.log1p(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log1p_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(0, 2, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.log1p(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.log1p(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log1p_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(0, 2, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.log1p(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.log1p(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_log1p_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = dpt.e / 1000 + X[..., 1::2] = dpt.e / 100 + + Y = dpt.log1p(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.log1p(np.float32(dpt.e / 1000)) + expected_Y[..., 1::2] = np.log1p(np.float32(dpt.e / 100)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log1p_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = dpt.e / 1000 + X[..., 1::2] = dpt.e / 100 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.log1p(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.log1p(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_log1p_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -2.0, -1.0, -0.0, 0.0, dpt.inf], + dtype="f4", + sycl_queue=q, + ) + res = np.asarray([np.nan, np.nan, -np.inf, -0.0, 0.0, np.inf], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(divide="ignore", invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.log1p(X)), res, atol=tol, rtol=tol) + + # special cases for complex + vals = [ + complex(-1.0, 0.0), + complex(2.0, dpt.inf), + complex(2.0, dpt.nan), + complex(-dpt.inf, 1.0), + 
complex(dpt.inf, 1.0), + complex(-dpt.inf, dpt.inf), + complex(dpt.inf, dpt.inf), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 1.0), + complex(dpt.nan, dpt.inf), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(-np.inf, 0.0), + complex(np.inf, np.pi / 2), + c_nan, + complex(np.inf, np.pi), + complex(np.inf, 0.0), + complex(np.inf, 3 * np.pi / 4), + complex(np.inf, np.pi / 4), + complex(np.inf, np.nan), + c_nan, + complex(np.inf, np.nan), + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + dpt_res = dpt.asnumpy(dpt.log1p(X)) + assert_allclose(np.real(dpt_res), np.real(res), atol=tol, rtol=tol) + assert_allclose(np.imag(dpt_res), np.imag(res), atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_log2.py b/dpnp/tests/tensor/elementwise/test_log2.py new file mode 100644 index 000000000000..7cd2f4615133 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_log2.py @@ -0,0 +1,148 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.log2(np.array(1, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.log2(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.log2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), np.log2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.log2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), np.log2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_log_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + Y = dpt.log2(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.log2(np.float32(4 * dpt.e)) + expected_Y[..., 1::2] = np.log2(np.float32(10 * dpt.e)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.log2(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.log2(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) + + +def test_log_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q + ) + Xnp = dpt.asnumpy(X) + + with np.errstate(invalid="ignore", divide="ignore"): + assert_equal(dpt.asnumpy(dpt.log2(X)), np.log2(Xnp)) diff --git a/dpnp/tests/tensor/elementwise/test_logaddexp.py b/dpnp/tests/tensor/elementwise/test_logaddexp.py new file mode 100644 index 000000000000..fc16c1722d98 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logaddexp.py @@ -0,0 
+1,211 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes +import re + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_logaddexp_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.logaddexp(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logaddexp(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + tol = 8 * max( + np.finfo(r.dtype).resolution, np.finfo(expected.dtype).resolution + ) + assert_allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logaddexp(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logaddexp(dpt.asnumpy(ar3)[::-1], dpt.asnumpy(ar4)[::2]) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert_allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logaddexp_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) 
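+    # both operands hold the same values and dtype; only their USM
+    # allocation types differ, which is what is under test here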
+ ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.logaddexp(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_logaddexp_order(): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]): + ar1 = dpt.ones(test_shape, dtype=dt1, order="C") + ar2 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.logaddexp(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logaddexp(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logaddexp(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype=dt1, order="F") + ar2 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.logaddexp(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logaddexp(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logaddexp(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.logaddexp(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.logaddexp(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + +def test_logaddexp_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.logaddexp(m, v) + + expected = np.logaddexp( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.logaddexp(v, m) + expected2 = np.logaddexp( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +def test_logaddexp_broadcasting_error(): + get_queue_or_skip() + m = dpt.ones((10, 10), dtype="i4") + v = dpt.ones((3,), dtype="i4") + with pytest.raises(ValueError): + dpt.logaddexp(m, v) + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes) +def test_logaddexp_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.logaddexp(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.logaddexp(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_logaddexp_dtype_error( + dtype, +): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + ar1 = dpt.ones(5, dtype=dtype) + ar2 = dpt.ones_like(ar1, dtype="f4") + + y = dpt.zeros_like(ar1, dtype="int8") + with pytest.raises(ValueError) as excinfo: + dpt.logaddexp(ar1, ar2, out=y) + assert re.match("Output array of type.*is needed", str(excinfo.value)) diff --git a/dpnp/tests/tensor/elementwise/test_logical_and.py b/dpnp/tests/tensor/elementwise/test_logical_and.py new file 
mode 100644 index 000000000000..09f5838265af --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logical_and.py @@ -0,0 +1,321 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_logical_and_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op1_dtype) + ar2 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op2_dtype) + + r = dpt.logical_and(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_and(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_and(ar1, ar2, out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logical_and(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logical_and( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_and(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + +@pytest.mark.parametrize("op_dtype", 
["c8", "c16"]) +def test_logical_and_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 2, sz) + ar1_np_imag = np.random.randint(0, 2, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 2, sz) + ar2_np_imag = np.random.randint(0, 2, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.logical_and(ar1, ar2) + expected = np.logical_and(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_and(ar1[::-2], ar2[::2]) + expected1 = np.logical_and(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray( + [ + 2.0 + 0j, + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ], + dtype=op_dtype, + ) + ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.logical_and(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.logical_and(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_and(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.logical_and(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_logical_and_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8") + ar2 = dpt.full(ar1.shape, 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.logical_and(ar1, ar2) + expected = np.logical_and(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_and(ar2, ar1) + expected1 = np.logical_and(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [ + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ]: + ar3 = dpt.full(ar1.shape, tp) + ar3_np = dpt.asnumpy(ar3) + r2 = dpt.logical_and(ar1, ar3) + expected2 = np.logical_and(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_and(ar3, ar1) + expected3 = np.logical_and(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logical_and_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type + ) + ar2 = dpt.asarray( + np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type + ) + + r = dpt.logical_and(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_logical_and_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.logical_and(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_and(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_and(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logical_and(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + 
ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.logical_and(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_and(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_and(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logical_and(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.logical_and(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.logical_and(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_logical_and_broadcasting(): + get_queue_or_skip() + + m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.logical_and(m, v) + + expected = np.logical_and(dpt.asnumpy(m), dpt.asnumpy(v)) + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.logical_and(v, m) + expected2 = np.logical_and(dpt.asnumpy(v), dpt.asnumpy(m)) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.empty_like(r) + dpt.logical_and(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + r4 = dpt.empty_like(r) + dpt.logical_and(v, m, out=r4) + assert (dpt.asnumpy(r4) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +@pytest.mark.parametrize("scalar_val", [0, 1]) +def test_logical_and_python_scalar(arr_dt, scalar_val): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.asarray( + np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q + ) + py_ones = ( + bool(scalar_val), + int(scalar_val), + float(scalar_val), + complex(scalar_val), + np.float32(scalar_val), + ctypes.c_int(scalar_val), + ) + for sc in py_ones: + R = dpt.logical_and(X, sc) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_and(dpt.asnumpy(X), sc) + assert (dpt.asnumpy(R) == E).all() + + R = dpt.logical_and(sc, X) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_and(sc, dpt.asnumpy(X)) + assert (dpt.asnumpy(R) == E).all() + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_logical_and_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.logical_and(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_logical_and_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.logical_and(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_logical_not.py b/dpnp/tests/tensor/elementwise/test_logical_not.py new file mode 100644 index 000000000000..fa1d5e786bd3 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logical_not.py @@ -0,0 +1,198 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op_dtype", _all_dtypes) +def test_logical_not_dtype_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 7 + ar1_np = np.random.randint(0, 2, sz) + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + r = dpt.logical_not(ar1) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_not(ar1_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_not(ar1, out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + ar2 = dpt.zeros(sz, dtype=op_dtype) + r = dpt.logical_not(ar2[::-1]) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_not(np.zeros(ar2.shape, dtype=op_dtype)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar2.shape + assert (dpt.asnumpy(r) == expected).all() + + ar3 = dpt.ones(sz, dtype=op_dtype) + r2 = dpt.logical_not(ar3[::2]) + assert isinstance(r2, dpt.usm_ndarray) + + expected = np.logical_not(np.ones(ar3.shape, dtype=op_dtype)[::2]) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert (dpt.asnumpy(r2) == expected).all() + + r3 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_not(ar2[::-1], out=r3) + assert (dpt.asnumpy(r) == dpt.asnumpy(r3)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_logical_not_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 2, sz) + ar1_np_imag = np.random.randint(0, 2, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np,
dtype=op_dtype) + + r = dpt.logical_not(ar1) + expected = np.logical_not(ar1_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_not(ar1[::-2]) + expected1 = np.logical_not(ar1_np[::-2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar2 = dpt.asarray( + [ + 2.0 + 0j, + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ], + dtype=op_dtype, + ) + ar2_np = dpt.asnumpy(ar2) + r2 = dpt.logical_not(ar2) + with np.errstate(invalid="ignore"): + expected2 = np.logical_not(ar2_np) + assert (dpt.asnumpy(r2) == expected2).all() + + +def test_logical_not_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8") + + r = dpt.logical_not(ar1) + expected = np.logical_not(dpt.asnumpy(ar1)) + assert (dpt.asnumpy(r) == expected).all() + + with np.errstate(invalid="ignore"): + for tp in [ + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ]: + ar2 = dpt.full(ar1.shape, tp) + r2 = dpt.logical_not(ar2) + expected2 = np.logical_not(dpt.asnumpy(ar2)) + assert (dpt.asnumpy(r2) == expected2).all() + + +@pytest.mark.parametrize("op_usm_type", _usm_types) +def test_logical_not_usm_type_matrix(op_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type + ) + + r = dpt.logical_not(ar1) + assert isinstance(r, dpt.usm_ndarray) + assert r.usm_type == op_usm_type + + +def test_logical_not_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.logical_not(ar1, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_not(ar1, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_not(ar1, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logical_not(ar1, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.zeros((20, 20), dtype="i4", order="F") + r1 = dpt.logical_not(ar1, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_not(ar1, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_not(ar1, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logical_not(ar1, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.logical_not(ar1, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.logical_not(ar1, order="K") + assert r4.strides == (-1, 20) diff --git a/dpnp/tests/tensor/elementwise/test_logical_or.py b/dpnp/tests/tensor/elementwise/test_logical_or.py new file mode 100644 index 000000000000..42c7e6f645b3 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logical_or.py @@ -0,0 +1,322 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_logical_or_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op1_dtype) + ar2 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op2_dtype) + + r = dpt.logical_or(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_or(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_or(ar1, ar2, out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logical_or(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logical_or( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_or(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_logical_or_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 2, sz) + ar1_np_imag = np.random.randint(0, 2, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 2, sz) + ar2_np_imag = np.random.randint(0, 2, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.logical_or(ar1, ar2) + expected = np.logical_or(ar1_np, ar2_np) + 
assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_or(ar1[::-2], ar2[::2]) + expected1 = np.logical_or(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray( + [ + 2.0 + 0j, + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ], + dtype=op_dtype, + ) + ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.logical_or(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.logical_or(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_or(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.logical_or(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_logical_or_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8") + ar2 = dpt.full(ar1.shape, 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.logical_or(ar1, ar2) + expected = np.logical_or(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_or(ar2, ar1) + expected1 = np.logical_or(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [ + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ]: + ar3 = dpt.full(ar1.shape, tp) + ar3_np = dpt.asnumpy(ar3) + + r2 = dpt.logical_or(ar1, ar3) + expected2 = np.logical_or(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_or(ar3, ar1) + expected3 = np.logical_or(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logical_or_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type + ) + ar2 = dpt.asarray( + np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type + ) + + r = dpt.logical_or(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_logical_or_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.logical_or(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_or(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_or(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logical_or(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.logical_or(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_or(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_or(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logical_or(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.logical_or(ar1, ar2, order="K") + assert r4.strides == (20, -1) + 
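+    # repeat the strided checks on transposed views; order="K" should
+    # preserve the permuted strides asserted below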
+ ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.logical_or(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_logical_or_broadcasting(): + get_queue_or_skip() + + m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.logical_or(m, v) + + expected = np.logical_or(dpt.asnumpy(m), dpt.asnumpy(v)) + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.logical_or(v, m) + expected2 = np.logical_or(dpt.asnumpy(v), dpt.asnumpy(m)) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.empty_like(r) + dpt.logical_or(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + r4 = dpt.empty_like(r) + dpt.logical_or(v, m, out=r4) + assert (dpt.asnumpy(r4) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +@pytest.mark.parametrize("scalar_val", [0, 1]) +def test_logical_or_python_scalar(arr_dt, scalar_val): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.asarray( + np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q + ) + py_ones = ( + bool(scalar_val), + int(scalar_val), + float(scalar_val), + complex(scalar_val), + np.float32(scalar_val), + ctypes.c_int(scalar_val), + ) + for sc in py_ones: + R = dpt.logical_or(X, sc) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_or(dpt.asnumpy(X), sc) + assert (dpt.asnumpy(R) == E).all() + + R = dpt.logical_or(sc, X) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_or(sc, dpt.asnumpy(X)) + assert (dpt.asnumpy(R) == E).all() + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_logical_or_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.logical_or(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_logical_or_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.logical_or(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_logical_xor.py b/dpnp/tests/tensor/elementwise/test_logical_xor.py new file mode 100644 index 000000000000..da2b79974f12 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logical_xor.py @@ -0,0 +1,323 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_logical_xor_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1_np = np.random.randint(0, 2, sz) + ar1 = dpt.asarray(ar1_np, dtype=op1_dtype) + ar2_np = np.random.randint(0, 2, sz) + ar2 = dpt.asarray(ar2_np, dtype=op2_dtype) + + r = dpt.logical_xor(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_xor(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_xor(ar1, ar2, out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logical_xor(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logical_xor( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_xor(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_logical_xor_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 2, sz) + ar1_np_imag = np.random.randint(0, 2, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 2, sz) + ar2_np_imag = np.random.randint(0, 2, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.logical_xor(ar1, ar2) + expected = np.logical_xor(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_xor(ar1[::-2], ar2[::2]) + expected1 = np.logical_xor(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == 
expected1).all() + + ar3 = dpt.asarray( + [ + 2.0 + 0j, + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ], + dtype=op_dtype, + ) + ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.logical_xor(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.logical_xor(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_xor(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.logical_xor(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_logical_xor_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8") + ar2 = dpt.full(ar1.shape, 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.logical_xor(ar1, ar2) + expected = np.logical_xor(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_xor(ar2, ar1) + expected1 = np.logical_xor(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [ + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ]: + ar3 = dpt.full(ar1.shape, tp) + ar3_np = dpt.asnumpy(ar3) + r2 = dpt.logical_xor(ar1, ar3) + expected2 = np.logical_xor(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_xor(ar3, ar1) + expected3 = np.logical_xor(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logical_xor_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type + ) + ar2 = dpt.asarray( + np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type + ) + + r = dpt.logical_xor(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_logical_xor_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.logical_xor(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_xor(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_xor(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logical_xor(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.logical_xor(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_xor(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_xor(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logical_xor(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.logical_xor(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.logical_xor(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_logical_xor_broadcasting(): + get_queue_or_skip() + + m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4") + v = 
dpt.arange(1, 6, dtype="i4") + + r = dpt.logical_xor(m, v) + + expected = np.logical_xor(dpt.asnumpy(m), dpt.asnumpy(v)) + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.logical_xor(v, m) + expected2 = np.logical_xor(dpt.asnumpy(v), dpt.asnumpy(m)) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.empty_like(r) + dpt.logical_xor(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + r4 = dpt.empty_like(r) + dpt.logical_xor(v, m, out=r4) + assert (dpt.asnumpy(r4) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +@pytest.mark.parametrize("scalar_val", [0, 1]) +def test_logical_xor_python_scalar(arr_dt, scalar_val): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.asarray( + np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q + ) + py_ones = ( + bool(scalar_val), + int(scalar_val), + float(scalar_val), + complex(scalar_val), + np.float32(scalar_val), + ctypes.c_int(scalar_val), + ) + for sc in py_ones: + R = dpt.logical_xor(X, sc) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_xor(dpt.asnumpy(X), sc) + assert (dpt.asnumpy(R) == E).all() + + R = dpt.logical_xor(sc, X) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_xor(sc, dpt.asnumpy(X)) + assert (dpt.asnumpy(R) == E).all() + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_logical_xor_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.logical_xor(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_logical_xor_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.logical_xor(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_maximum_minimum.py b/dpnp/tests/tensor/elementwise/test_maximum_minimum.py new file mode 100644 index 000000000000..2eb6d9de7582 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_maximum_minimum.py @@ -0,0 +1,329 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes +import itertools + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_maximum_minimum_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1_np = np.arange(sz) + np.random.shuffle(ar1_np) + ar1 = dpt.asarray(ar1_np, dtype=op1_dtype) + ar2_np = np.arange(sz) + np.random.shuffle(ar2_np) + ar2 = dpt.asarray(ar2_np, dtype=op2_dtype) + + r = dpt.maximum(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.maximum(ar1_np.astype(op1_dtype), ar2_np.astype(op2_dtype)) + + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.minimum(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.minimum(ar1_np.astype(op1_dtype), ar2_np.astype(op2_dtype)) + + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3_np = np.arange(sz) + np.random.shuffle(ar3_np) + ar3 = dpt.asarray(ar3_np, dtype=op1_dtype) + ar4_np = np.arange(2 * sz) + np.random.shuffle(ar4_np) + ar4 = dpt.asarray(ar4_np, dtype=op2_dtype) + + r = dpt.maximum(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.maximum( + ar3_np[::-1].astype(op1_dtype), ar4_np[::2].astype(op2_dtype) + ) + + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + r = dpt.minimum(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.minimum( + ar3_np[::-1].astype(op1_dtype), ar4_np[::2].astype(op2_dtype) + ) + + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_maximum_minimum_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1 = dpt.asarray(ar1_np_real + 1j * ar1_np_imag, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2 = dpt.asarray(ar2_np_real + 1j * ar2_np_imag, dtype=op_dtype) + + r = dpt.maximum(ar1, ar2) + expected = np.maximum(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, 
expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert_array_equal(dpt.asnumpy(r), expected) + + r1 = dpt.maximum(ar1[::-2], ar2[::2]) + expected1 = np.maximum(dpt.asnumpy(ar1[::-2]), dpt.asnumpy(ar2[::2])) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert_array_equal(dpt.asnumpy(r1), expected1) + + r = dpt.minimum(ar1, ar2) + expected = np.minimum(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert_array_equal(dpt.asnumpy(r), expected) + + r1 = dpt.minimum(ar1[::-2], ar2[::2]) + expected1 = np.minimum(dpt.asnumpy(ar1[::-2]), dpt.asnumpy(ar2[::2])) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert_array_equal(dpt.asnumpy(r1), expected1) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_maximum_minimum_real_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = [np.nan, np.inf, -np.inf, 5.0, -3.0] + x = list(itertools.product(x, repeat=2)) + Xnp = np.array([tup[0] for tup in x], dtype=dtype) + Ynp = np.array([tup[1] for tup in x], dtype=dtype) + X = dpt.asarray(Xnp, dtype=dtype) + Y = dpt.asarray(Ynp, dtype=dtype) + + R = dpt.maximum(X, Y) + Rnp = np.maximum(Xnp, Ynp) + assert_array_equal(dpt.asnumpy(R), Rnp) + + R = dpt.minimum(X, Y) + Rnp = np.minimum(Xnp, Ynp) + assert_array_equal(dpt.asnumpy(R), Rnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_maximum_minimum_complex_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = [np.nan, np.inf, -np.inf, +2.0, -1.0] + x = [complex(*val) for val in itertools.product(x, repeat=2)] + x = list(itertools.product(x, repeat=2)) + + Xnp = np.array([tup[0] for tup in x], dtype=dtype) + Ynp = np.array([tup[1] for tup in x], dtype=dtype) + X = dpt.asarray(Xnp, dtype=dtype, sycl_queue=q) + Y = dpt.asarray(Ynp, dtype=dtype, sycl_queue=q) + + R = dpt.maximum(X, Y) + Rnp = np.maximum(Xnp, Ynp) + assert_array_equal(dpt.asnumpy(dpt.real(R)), np.real(Rnp)) + assert_array_equal(dpt.asnumpy(dpt.imag(R)), np.imag(Rnp)) + + R = dpt.minimum(X, Y) + Rnp = np.minimum(Xnp, Ynp) + assert_array_equal(dpt.asnumpy(dpt.real(R)), np.real(Rnp)) + assert_array_equal(dpt.asnumpy(dpt.imag(R)), np.imag(Rnp)) + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_maximum_minimum_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1_np = np.arange(sz, dtype="i4") + np.random.shuffle(ar1_np) + ar1 = dpt.asarray(ar1_np, usm_type=op1_usm_type) + ar2_np = np.arange(sz, dtype="i4") + np.random.shuffle(ar2_np) + ar2 = dpt.asarray(ar2_np, usm_type=op2_usm_type) + + r = dpt.maximum(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + r = dpt.minimum(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_maximum_minimum_order(): + get_queue_or_skip() + + ar1_np = np.arange(20 * 20, dtype="i4").reshape(20, 20) + np.random.shuffle(ar1_np) + ar1 = dpt.asarray(ar1_np, order="C") + ar2_np = np.arange(20 * 20, dtype="i4").reshape(20, 20) + np.random.shuffle(ar2_np) + ar2 = dpt.asarray(ar2_np, order="C") + + r1 
= dpt.maximum(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.maximum(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.maximum(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.maximum(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.asarray(ar1_np, order="F") + ar2 = dpt.asarray(ar2_np, order="F") + r1 = dpt.maximum(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.maximum(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.maximum(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.maximum(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1_np = np.arange(40 * 40, dtype="i4").reshape(40, 40) + np.random.shuffle(ar1_np) + ar1 = dpt.asarray(ar1_np, order="C")[:20, ::-2] + ar2_np = np.arange(40 * 40, dtype="i4").reshape(40, 40) + np.random.shuffle(ar2_np) + ar2 = dpt.asarray(ar2_np, order="C")[:20, ::-2] + r4 = dpt.maximum(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.asarray(ar1_np, order="C")[:20, ::-2].mT + ar2 = dpt.asarray(ar2_np, order="C")[:20, ::-2].mT + r4 = dpt.maximum(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_maximum_minimum_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.maximum(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.maximum(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + R = dpt.minimum(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.minimum(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_maximum_minimum_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.maximum(a, c) + assert isinstance(r, dpt.usm_ndarray) + + r = dpt.minimum(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_maximum_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.maximum(a, c) + + with pytest.raises(ValueError): + dpt.minimum(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_multiply.py b/dpnp/tests/tensor/elementwise/test_multiply.py new file mode 100644 index 000000000000..33dbef03f347 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_multiply.py @@ -0,0 +1,251 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_multiply_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.multiply(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.multiply( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.multiply(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.multiply( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_multiply_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.multiply(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_multiply_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.multiply(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.multiply(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.multiply(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), 
dtype="i4", order="F") + r1 = dpt.multiply(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.multiply(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.multiply(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_multiply_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.multiply(m, v) + + expected = np.multiply( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.multiply(v, m) + expected2 = np.multiply( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_multiply_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.multiply(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.multiply(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +@pytest.mark.parametrize("sc", [bool(1), int(1), float(1), complex(1)]) +def test_multiply_python_scalar_gh1219(arr_dt, sc): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + Xnp = np.ones(4, dtype=arr_dt) + + X = dpt.ones(4, dtype=arr_dt, sycl_queue=q) + + R = dpt.multiply(X, sc) + Rnp = np.multiply(Xnp, sc) + assert _compare_dtypes(R.dtype, Rnp.dtype, sycl_queue=q) + + # symmetric case + R = dpt.multiply(sc, X) + Rnp = np.multiply(sc, Xnp) + assert _compare_dtypes(R.dtype, Rnp.dtype, sycl_queue=q) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_multiply_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X *= int(1) + elif dt_kind == "f": + X *= float(1) + elif dt_kind == "c": + X *= complex(1) + elif dt_kind == "b": + X *= bool(1) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_multiply_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 *= ar2 + assert ( + dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + ar3[::-1] *= ar4[::2] + assert ( + dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype) + ).all() + + else: + with pytest.raises(ValueError): + ar1 *= ar2 + + +def 
test_multiply_inplace_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + m *= v + assert (dpt.asnumpy(m) == np.arange(0, 5, dtype="i4")[np.newaxis, :]).all() diff --git a/dpnp/tests/tensor/elementwise/test_negative.py b/dpnp/tests/tensor/elementwise/test_negative.py new file mode 100644 index 000000000000..9713f0ecb364 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_negative.py @@ -0,0 +1,101 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _all_dtypes, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_negative_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + assert dpt.negative(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.negative(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.negative(X))) + + +def test_negative_bool(): + get_queue_or_skip() + x = dpt.ones(64, dtype="?") + with pytest.raises(ValueError): + dpt.negative(x) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_negative_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.negative(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.negative(dpt.asnumpy(X)) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_negative_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.negative(np.ones(U.shape, dtype=U.dtype)) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + for ord in ["C", "F", "A", "K"]: + Y = dpt.negative(U, order=ord) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpnp/tests/tensor/elementwise/test_nextafter.py b/dpnp/tests/tensor/elementwise/test_nextafter.py new file mode 100644 index 000000000000..b904bc42c6b7 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_nextafter.py @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_nextafter_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + r = dpt.nextafter(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.nextafter( + np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q) + + r = dpt.nextafter(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.nextafter( + np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) +def test_nextafter_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.nextafter(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.nextafter(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_nextafter_special_cases_nan(dt): + """If either x1_i or x2_i is NaN, the result is NaN.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([2.0, dpt.nan, dpt.nan], dtype=dt) + x2 = dpt.asarray([dpt.nan, 2.0, dpt.nan], dtype=dt) + + y = dpt.nextafter(x1, x2) + assert dpt.all(dpt.isnan(y)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_nextafter_special_cases_zero(dt): + """If x1_i is equal to x2_i, the result is x2_i.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.0, 0.0, -0.0, 0.0], dtype=dt) + x2 = dpt.asarray([0.0, -0.0, -0.0, 0.0], dtype=dt) + + y = dpt.nextafter(x1, x2) + assert dpt.all(y == 0) + + skip_checking_signs = ( + x1.dtype == dpt.float16 + and x1.sycl_device.backend == dpctl.backend_type.cuda + ) + if skip_checking_signs: + pytest.skip( + "Skipped checking signs for nextafter due to " + "known issue in DPC++ 
support for CUDA devices" + ) + else: + assert dpt.all(dpt.signbit(y) == dpt.signbit(x2)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_nextafter_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + s = 10 + x1 = dpt.ones(s, dtype=dt, sycl_queue=q) + x2 = dpt.full(s, 2, dtype=dt, sycl_queue=q) + + r = dpt.nextafter(x1, x2) + expected_diff = dpt.asarray(dpt.finfo(dt).eps, dtype=dt, sycl_queue=q) + + assert dpt.all(r > 0) + assert dpt.all(r - x1 == expected_diff) + + x3 = dpt.zeros(s, dtype=dt, sycl_queue=q) + + r = dpt.nextafter(x3, x1) + assert dpt.all(r > 0) + + r = dpt.nextafter(x1, x3) + assert dpt.all((r - x1) < 0) + + r = dpt.nextafter(x1, 0) + assert dpt.all(x1 - r == (expected_diff) / 2) + + r = dpt.nextafter(x3, dpt.inf) + assert dpt.all(r > 0) + + r = dpt.nextafter(x3, -dpt.inf) + assert dpt.all(r < 0) diff --git a/dpnp/tests/tensor/elementwise/test_not_equal.py b/dpnp/tests/tensor/elementwise/test_not_equal.py new file mode 100644 index 000000000000..3f0eb58cf8b7 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_not_equal.py @@ -0,0 +1,225 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_not_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.not_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.not_equal( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, False, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.not_equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.not_equal( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, False, dtype=r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_not_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.not_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_not_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.not_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.not_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.not_equal(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.not_equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.not_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.not_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.not_equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.not_equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.not_equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.not_equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_not_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.not_equal(m, v) + expected = np.full((100, 5), [True, False, True, True, True], dtype="?") + + assert (dpt.asnumpy(r) == 
expected).all() + + r2 = dpt.not_equal(v, m) + assert (dpt.asnumpy(r2) == expected).all() + + r3 = dpt.empty_like(m, dtype="?") + dpt.not_equal(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_not_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.not_equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + assert not dpt.any(R) + R = dpt.not_equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + assert not dpt.any(R) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_not_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.not_equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_not_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.not_equal(a, c) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_not_equal_alignment(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 256 + s = dpt.concat((dpt.zeros(n, dtype=dtype), dpt.ones(n, dtype=dtype))) + + mask = s[:-1] != s[1:] + (pos,) = dpt.nonzero(mask) + assert dpt.all(pos == n - 1) + + out_arr = dpt.zeros(2 * n, dtype=mask.dtype) + dpt.not_equal(s[:-1], s[1:], out=out_arr[1:]) + (pos,) = dpt.nonzero(out_arr) + assert dpt.all(pos == n) diff --git a/dpnp/tests/tensor/elementwise/test_positive.py b/dpnp/tests/tensor/elementwise/test_positive.py new file mode 100644 index 000000000000..d4358e5827da --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_positive.py @@ -0,0 +1,94 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _all_dtypes, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_positive_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + assert dpt.positive(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.positive(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.positive(X))) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_positive_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.positive(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = dpt.asnumpy(X) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_positive_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.ones(U.shape, dtype=U.dtype) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + for ord in ["C", "F", "A", "K"]: + Y = dpt.positive(U, order=ord) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpnp/tests/tensor/elementwise/test_pow.py b/dpnp/tests/tensor/elementwise/test_pow.py new file mode 100644 index 000000000000..c68e6ad13b0a --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_pow.py @@ -0,0 +1,229 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:]) +def test_power_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.pow(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.power( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.pow(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.power( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_power_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.pow(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_pow_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.pow(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.pow(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.pow(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.pow(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.pow(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.pow(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.pow(ar1, ar2, 
order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_pow_broadcasting(): + get_queue_or_skip() + + v = dpt.arange(1, 6, dtype="i4") + m = dpt.full((100, 5), 2, dtype="i4") + + r = dpt.pow(m, v) + + expected = np.power( + np.full((100, 5), 2, dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.pow(v, m) + expected2 = np.power( + np.arange(1, 6, dtype="i4"), np.full((100, 5), 2, dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_pow_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.pow(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.pow(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_pow_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X **= int(1) + elif dt_kind == "f": + X **= float(1) + elif dt_kind == "c": + X **= complex(1) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:]) +def test_pow_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 **= ar2 + assert ( + dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + ar3[::-1] **= ar4[::2] + assert ( + dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype) + ).all() + + else: + with pytest.raises(ValueError): + ar1 **= ar2 + + +def test_pow_inplace_basic(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + expected = dpt.square(x) + x **= 2 + + assert dpt.all(x == expected) diff --git a/dpnp/tests/tensor/elementwise/test_reciprocal.py b/dpnp/tests/tensor/elementwise/test_reciprocal.py new file mode 100644 index 000000000000..dd31c3323f68 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_reciprocal.py @@ -0,0 +1,108 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes, _complex_fp_dtypes
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    one = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = dpt.divide(one, x).dtype
+    assert dpt.reciprocal(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    res = dpt.reciprocal(x)
+    expected = 1 / x
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    res = dpt.reciprocal(x)
+    expected = 1 / x
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+def test_reciprocal_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.reciprocal(x)
+    expected = dpt.asarray([dpt.nan, dpt.inf, -dpt.inf, 0.0, -0.0], dtype="f4")
+    assert dpt.allclose(res, expected, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_reciprocal_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.reciprocal(z)
+
+    expected = 1 / z
+
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpnp/tests/tensor/elementwise/test_remainder.py b/dpnp/tests/tensor/elementwise/test_remainder.py
new file mode 100644
index 000000000000..b8d5ca1cf8ae
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_remainder.py
@@ -0,0 +1,277 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
+def test_remainder_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.remainder(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.remainder(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.remainder(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.remainder(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_remainder_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.remainder(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
+    assert r.usm_type == expected_usm_type
+
+
+def test_remainder_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.remainder(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.remainder(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.remainder(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.remainder(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.remainder(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.remainder(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes[1:8:2])
+def test_remainder_negative_integers(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(-5, -1, 1, dtype=dt, sycl_queue=q)
+    x_np = np.arange(-5, -1, 1, dtype=dt)
+    val = 3
+
+    r1 = dpt.remainder(x, val)
+    expected = np.remainder(x_np, val)
+    assert (dpt.asnumpy(r1) == expected).all()
+
+    r2 = dpt.remainder(val, x)
+    expected = np.remainder(val, x_np)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+
+def test_remainder_integer_zero():
+    get_queue_or_skip()
+
+    for dt in ["i4", "u4"]:
+        x = dpt.ones(1, dtype=dt)
+        y = dpt.zeros_like(x)
+
+        assert (dpt.asnumpy(dpt.remainder(x, y)) == np.zeros(1, dtype=dt)).all()
+
+        x = dpt.astype(x, dt)
+        y = dpt.zeros_like(x)
+
+        assert (dpt.asnumpy(dpt.remainder(x, y)) == np.zeros(1, dtype=dt)).all()
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes[9:])
+def test_remainder_negative_floats(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.linspace(-5, 5, 20, dtype=dt, sycl_queue=q)
+    x_np = np.linspace(-5, 5, 20, dtype=dt)
+    val = 3
+
+    tol = 8 * dpt.finfo(dt).resolution
+
+    r1 = dpt.remainder(x, val)
+    expected = np.remainder(x_np, val)
+    with np.errstate(invalid="ignore"):
+        assert np.allclose(
+            dpt.asnumpy(r1), expected, rtol=tol, atol=tol, equal_nan=True
+        )
+
+    r2 = dpt.remainder(val, x)
+    expected = np.remainder(val, x_np)
+    with np.errstate(invalid="ignore"):
+        assert np.allclose(
+            dpt.asnumpy(r2), expected, rtol=tol, atol=tol, equal_nan=True
+        )
+
+
+def test_remainder_special_cases():
+    get_queue_or_skip()
+
+    lhs = [dpt.nan, dpt.inf, 0.0, -0.0, -0.0, 1.0, dpt.inf, -dpt.inf]
+    rhs = [dpt.nan, dpt.inf, -0.0, 1.0, 1.0, 0.0, 1.0, -1.0]
+
+    x, y = dpt.asarray(lhs, dtype="f4"), dpt.asarray(rhs, dtype="f4")
+
+    x_np, y_np = np.asarray(lhs, dtype="f4"), np.asarray(rhs, dtype="f4")
+
+    res = dpt.remainder(x, y)
+
+    with np.errstate(invalid="ignore"):
+        assert np.allclose(dpt.asnumpy(res), np.remainder(x_np, y_np), equal_nan=True)
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes)
+def test_remainder_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.remainder(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.remainder(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X %= int(1)
+    elif dt_kind == "f":
+        X %= float(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 %= ar2
+        assert dpt.all(ar1 == dpt.zeros(ar1.shape, dtype=ar1.dtype))
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] %= ar4[::2]
+        assert dpt.all(ar3 == dpt.zeros(ar3.shape, dtype=ar3.dtype))
+
+    else:
+        with pytest.raises(ValueError):
+            ar1 %= ar2
+
+
+def test_remainder_inplace_basic():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    expected = x & 1
+    x %= 2
+
+    assert dpt.all(x == expected)
diff --git a/dpnp/tests/tensor/elementwise/test_round.py b/dpnp/tests/tensor/elementwise/test_round.py
new file mode 100644
index 000000000000..5cfcb6dd598e
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_round.py
@@ -0,0 +1,234 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_array_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_round_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0.1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.round(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.round(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    Xnp = np.linspace(0.01, 88.1, num=n_seq, dtype=dtype)
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt.round(X)
+    Ynp = np.round(Xnp)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt.round(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    low = -88.0
+    high = 88.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt.round(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np.round(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt.round(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np.round(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_round_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 16.2
+    X[..., 1::2] = 23.7
+
+    Y = dpt.round(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.round(np.float32(16.2))
+    expected_Y[..., 1::2] = np.round(np.float32(23.7))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_round_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 8.8
+    X[..., 1::2] = 11.3
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.round(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.round(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [np.nan, np.inf, -np.inf, 1.5, 2.5, -1.5, -2.5, 0.0, -0.0]
+    Xnp = np.array(x, dtype=dtype)
+    X = dpt.asarray(x, dtype=dtype)
+
+    Y = dpt.asnumpy(dpt.round(X))
+    Ynp = np.round(Xnp)
+    assert_allclose(Y, Ynp, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=0.01, high=88.1, size=ii)
+        Xnp = Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.round(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.round(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -88.0
+    high = 88.0
+    for ii in sizes:
+        x1 = np.random.uniform(low=low, high=high, size=ii)
+        x2 = np.random.uniform(low=low, high=high, size=ii)
+        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.round(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.round(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, 1.5, 2.5, -1.5, -2.5, 0.0, -0.0]
+    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
+
+    Xc_np = np.array(xc, dtype=dtype)
+    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
+
+    Ynp = np.round(Xc_np)
+    Y = dpt.round(Xc)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.real(Y)), np.real(Ynp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(dpt.imag(Y)), np.imag(Ynp), atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_rsqrt.py b/dpnp/tests/tensor/elementwise/test_rsqrt.py
new file mode 100644
index 000000000000..559de121e9be
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_rsqrt.py
@@ -0,0 +1,93 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _map_to_device_dtype,
+    _no_complex_dtypes,
+    _real_fp_dtypes,
+)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_rsqrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.rsqrt(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+def test_rsqrt_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.rsqrt(x)
+    expected = dpt.asarray(
+        [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4"
+    )
+    assert dpt.allclose(res, expected, equal_nan=True)
diff --git a/dpnp/tests/tensor/elementwise/test_sign.py b/dpnp/tests/tensor/elementwise/test_sign.py
new file mode 100644
index 000000000000..e2addb23b711
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_sign.py
@@ -0,0 +1,140 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_sign_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
+    assert dpt.sign(X).dtype == arg_dt
+
+    r = dpt.empty_like(X, dtype=arg_dt)
+    dpt.sign(X, out=r)
+    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.sign(X)))
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_sign_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.sign(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = dpt.asnumpy(X)
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_sign_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    expected_dt = np.sign(np.ones(tuple(), dtype=arg_dt)).dtype
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.ones(U.shape, dtype=expected_dt)
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.sign(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_sign_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    Xnp = np.random.standard_normal(
+        size=input_shape
+    ) + 1j * np.random.standard_normal(size=input_shape)
+    Xnp = Xnp.astype(arg_dt)
+    X[...] = Xnp
+
+    for ord in ["C", "F", "A", "K"]:
+        for perms in itertools.permutations(range(4)):
+            U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+            Y = dpt.sign(U, order=ord)
+            X_t = np.transpose(Xnp[:, ::-1, ::-1, :], perms)
+            expected_Y = X_t / np.abs(X_t)
+            tol = dpt.finfo(Y.dtype).resolution
+            np.testing.assert_allclose(
+                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
+            )
+
+
+# test for all signed real data types
+@pytest.mark.parametrize(
+    "dt", _no_complex_dtypes[1:8:2] + _no_complex_dtypes[9:]
+)
+def test_sign_negative(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(-20, 20, 1, dtype=dt, sycl_queue=q)
+    x_np = np.arange(-20, 20, 1, dtype=dt)
+    res = dpt.sign(x)
+
+    assert (dpt.asnumpy(res) == np.sign(x_np)).all()
diff --git a/dpnp/tests/tensor/elementwise/test_signbit.py b/dpnp/tests/tensor/elementwise/test_signbit.py
new file mode 100644
index 000000000000..9006bcafbd2d
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_signbit.py
@@ -0,0 +1,124 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_out_type_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x = dpt.linspace(1, 10, num=256, dtype=arg_dt)
+    sb = dpt.signbit(x)
+    assert sb.dtype == dpt.bool
+
+    assert not dpt.any(sb)
+
+    x2 = dpt.linspace(-10, -1, num=256, dtype=arg_dt)
+    sb2 = dpt.signbit(x2)
+    assert dpt.all(sb2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_out_type_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x = dpt.linspace(1, 10, num=256, dtype=arg_dt)
+    sb = dpt.signbit(x[::-3])
+    assert sb.dtype == dpt.bool
+
+    assert not dpt.any(sb)
+
+    x2 = dpt.linspace(-10, -1, num=256, dtype=arg_dt)
+    sb2 = dpt.signbit(x2[::-3])
+    assert dpt.all(sb2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_special_cases_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    n = 63
+    x1 = dpt.full(n, -dpt.inf, dtype=arg_dt)
+    x2 = dpt.full(n, -0.0, dtype=arg_dt)
+    x3 = dpt.full(n, 0.0, dtype=arg_dt)
+    x4 = dpt.full(n, dpt.inf, dtype=arg_dt)
+
+    x = dpt.concat((x1, x2, x3, x4))
+    actual = dpt.signbit(x)
+
+    expected = dpt.concat(
+        (
+            dpt.full(x1.size, True),
+            dpt.full(x2.size, True),
+            dpt.full(x3.size, False),
+            dpt.full(x4.size, False),
+        )
+    )
+
+    assert dpt.all(dpt.equal(actual, expected))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_special_cases_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x1 = dpt.full(63, -dpt.inf, dtype=arg_dt)
+    x2 = dpt.full(63, -0.0, dtype=arg_dt)
+    x3 = dpt.full(63, 0.0, dtype=arg_dt)
+    x4 = dpt.full(63, dpt.inf, dtype=arg_dt)
+
+    x = dpt.concat((x1, x2, x3, x4))
+    actual = dpt.signbit(x[::-1])
+
+    expected = dpt.concat(
+        (
+            dpt.full(x4.size, False),
+            dpt.full(x3.size, False),
+            dpt.full(x2.size, True),
+            dpt.full(x1.size, True),
+        )
+    )
+
+    assert dpt.all(dpt.equal(actual, expected))
diff --git a/dpnp/tests/tensor/elementwise/test_sqrt.py b/dpnp/tests/tensor/elementwise/test_sqrt.py
new file mode 100644
index 000000000000..d6bc7a42434e
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_sqrt.py
@@ -0,0 +1,207 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+import warnings
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _map_to_device_dtype,
+    _real_fp_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_sqrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.sqrt(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.sqrt(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sqrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.sqrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sqrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.sqrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_sqrt_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 16.0
+    X[..., 1::2] = 23.0
+
+    Y = dpt.sqrt(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.sqrt(np.float32(16.0))
+    expected_Y[..., 1::2] = np.sqrt(np.float32(23.0))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_sqrt_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 16.0
+    X[..., 1::2] = 23.0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.sqrt(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.sqrt(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
+def test_sqrt_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
+    )
+    Xnp = dpt.asnumpy(X)
+
+    assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_sqrt_real_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+
+    x = dpt.asarray(inps_, dtype=dtype)
+    r = dpt.sqrt(x)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(inps_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_sqrt_complex_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.sqrt(z)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(c_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    if not dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True):
+        for i in range(r.shape[0]):
+            # some devices deviate from NumPy on complex special values;
+            # report the first mismatching element and skip the test
+            if not dpt.allclose(
+                r[i], expected[i], atol=tol, rtol=tol, equal_nan=True
+            ):
+                msg = (
+                    f"Test failed for input {z[i]}, i.e. {c_[i]} for index {i}"
+                )
+                msg += f", results were {r[i]} vs. {expected[i]}"
+                pytest.skip(reason=msg)
diff --git a/dpnp/tests/tensor/elementwise/test_square.py b/dpnp/tests/tensor/elementwise/test_square.py
new file mode 100644
index 000000000000..0b65e9af53ce
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_square.py
@@ -0,0 +1,114 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes, _usm_types
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_square_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.arange(5, dtype=arg_dt, sycl_queue=q)
+    assert dpt.square(X).dtype == arg_dt
+
+    r = dpt.empty_like(X, dtype=arg_dt)
+    dpt.square(X, out=r)
+    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.square(X)))
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_square_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.square(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = dpt.asnumpy(X)
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_square_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 2
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.full(U.shape, 4, dtype=U.dtype)
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.square(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_square_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    vals = [np.nan, np.inf, -np.inf, 0.0, -0.0]
+    X = dpt.asarray(vals, dtype=dtype, sycl_queue=q)
+    X_np = dpt.asnumpy(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    with np.errstate(all="ignore"):
+        assert np.allclose(
+            dpt.asnumpy(dpt.square(X)),
+            np.square(X_np),
+            atol=tol,
+            rtol=tol,
+            equal_nan=True,
+        )
diff --git a/dpnp/tests/tensor/elementwise/test_subtract.py b/dpnp/tests/tensor/elementwise/test_subtract.py
new file mode 100644
index 000000000000..70d05f926c23
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_subtract.py
@@ -0,0 +1,252 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_subtract_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.subtract(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.subtract(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(ar1, dtype=r.dtype)
+    dpt.subtract(ar1, ar2, out=r2)
+    assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all()
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.subtract(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.subtract(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all()
+
+    r2 = dpt.empty_like(ar1, dtype=r.dtype)
+    dpt.subtract(ar3[::-1], ar4[::2], out=r2)
+    assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all()
+
+
+def test_subtract_bool():
+    get_queue_or_skip()
+    ar1 = dpt.ones(127, dtype="?")
+    ar2 = dpt.ones_like(ar1, dtype="?")
+    with pytest.raises(ValueError):
+        dpt.subtract(ar1, ar2)
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_subtract_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.subtract(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
+    assert r.usm_type == expected_usm_type
+
+
+def test_subtract_order():
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="C")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="C")
+        r1 = dpt.subtract(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.subtract(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.subtract(ar1, ar2, order="A")
+        assert r3.flags.c_contiguous
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.flags.c_contiguous
+
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="F")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="F")
+        r1 = dpt.subtract(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.subtract(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.subtract(ar1, ar2, order="A")
+        assert r3.flags.f_contiguous
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.flags.f_contiguous
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.strides == (n, -1)
+        r5 = dpt.subtract(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.strides == (-1, n)
+        r5 = dpt.subtract(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+
+def test_subtract_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    r = dpt.subtract(m, v)
+    assert (
+        dpt.asnumpy(r) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :]
+    ).all()
+
+    r2 = dpt.subtract(v, m)
+    assert (
+        dpt.asnumpy(r2) == np.arange(-1, 4, dtype="i4")[np.newaxis, :]
+    ).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes[1:])
+def test_subtract_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        R = dpt.subtract(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.subtract(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_subtract_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X -= int(0)
+    elif dt_kind == "f":
+        X -= float(0)
+    elif dt_kind == "c":
+        X -= complex(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_subtract_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 -= ar2
+        assert (dpt.asnumpy(ar1) == np.zeros(ar1.shape, dtype=ar1.dtype)).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] -= ar4[::2]
+        assert (dpt.asnumpy(ar3) == np.zeros(ar3.shape, dtype=ar3.dtype)).all()
+
+    else:
+        with pytest.raises(ValueError):
+            ar1 -= ar2
+
+
+def test_subtract_inplace_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    m -= v
+    assert (
+        dpt.asnumpy(m) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :]
+    ).all()
diff --git a/dpnp/tests/tensor/elementwise/test_trigonometric.py b/dpnp/tests/tensor/elementwise/test_trigonometric.py
new file mode 100644
index 000000000000..497432360306
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_trigonometric.py
@@ -0,0 +1,234 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+_trig_funcs = [(np.sin, dpt.sin), (np.cos, dpt.cos), (np.tan, dpt.tan)]
+_inv_trig_funcs = [
+    (np.arcsin, dpt.asin),
+    (np.arccos, dpt.acos),
+    (np.arctan, dpt.atan),
+]
+_all_funcs = _trig_funcs + _inv_trig_funcs
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_trig_out_type(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np_call(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt_call(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    # pick a sampling domain appropriate to each family of functions
+    if np_call in _trig_funcs:
+        Xnp = np.linspace(
+            -np.pi / 2 * 0.99, np.pi / 2 * 0.99, num=n_seq, dtype=dtype
+        )
+    elif np_call == np.arctan:
+        Xnp = np.linspace(-100.0, 100.0, num=n_seq, dtype=dtype)
+    else:
+        Xnp = np.linspace(-1.0, 1.0, num=n_seq, dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_trig_complex_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 256
+    n_rep = 137
+    low = -9.0
+    high = 9.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = x1 + 1j * x2
+
+    # stay away from poles and branch lines
+    modulus = np.abs(Xnp)
+    sel = np.logical_or(
+        modulus < 0.9,
+        np.logical_and(
+            modulus > 1.2, np.minimum(np.abs(x2), np.abs(x1)) > 0.05
+        ),
+    )
+    Xnp = Xnp[sel]
+
+    X = dpt.repeat(dpt.asarray(Xnp, dtype=dtype, sycl_queue=q), n_rep)
+    Y = dpt_call(X)
+
+    expected = np.repeat(np_call(Xnp.astype(dtype)), n_rep)
+
+    tol = 50 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), expected, atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 3, 4, 6, 8, 9, 24, 50, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -100.0
+    high = 100.0
+    if np_call in [np.arccos, np.arcsin]:
+        low = -1.0
+        high = 1.0
+    elif np_call in [np.tan]:
+        low = -np.pi / 2 * (0.99)
+        high = np.pi / 2 * (0.99)
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=low, high=high, size=ii).astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_trig_complex_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 50 * dpt.finfo(dtype).resolution
+
+    low = -9.0
+    high = 9.0
+    while True:
+        x1 = np.random.uniform(low=low, high=high, size=2 * sum(sizes))
+        x2 = np.random.uniform(low=low, high=high, size=2 * sum(sizes))
+        Xnp_all = np.array(
+            [complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype
+        )
+
+        # stay away from poles and branch lines
+        modulus = np.abs(Xnp_all)
+        sel = np.logical_or(
+            modulus < 0.9,
+            np.logical_and(
+                modulus > 1.2, np.minimum(np.abs(x2), np.abs(x1)) > 0.05
+            ),
+        )
+        Xnp_all = Xnp_all[sel]
+        if Xnp_all.size > sum(sizes):
+            break
+
+    pos = 0
+    for ii in sizes:
+        pos = pos + ii
+        Xnp = Xnp_all[:pos]
+        Xnp = Xnp[-ii:]
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_special_cases(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, 2.0, -2.0, +0.0, -0.0, +1.0, -1.0]
+
+    xf = np.array(x, dtype=dtype)
+    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
+
+    with np.errstate(all="ignore"):
+        Y_np = np_call(xf)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    Y = dpt_call(yf)
+    assert_allclose(dpt.asnumpy(Y), Y_np, atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_type_utils.py b/dpnp/tests/tensor/elementwise/test_type_utils.py
new file mode 100644
index 000000000000..42e096f4f42d
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_type_utils.py
@@ -0,0 +1,254 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+import dpnp.tensor._type_utils as tu
+
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+
+class MockDevice:
+    def __init__(self, fp16: bool, fp64: bool):
+        self.has_aspect_fp16 = fp16
+        self.has_aspect_fp64 = fp64
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_type_utils_map_to_device_type(dtype):
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            dt_in = dpt.dtype(dtype)
+            dt_out = _map_to_device_dtype(dt_in, dev)
+            assert isinstance(dt_out, dpt.dtype)
+
+
+def test_type_util_all_data_types():
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            r = tu._all_data_types(fp16, fp64)
+            assert isinstance(r, list)
+            # 11: bool + 4 signed + 4 unsigned integral + float32 + complex64
+            assert len(r) == 11 + int(fp16) + 2 * int(fp64)
+
+
+def test_type_util_can_cast():
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            for from_ in _all_dtypes:
+                for to_ in _all_dtypes:
+                    r = tu._can_cast(
+                        dpt.dtype(from_), dpt.dtype(to_), fp16, fp64
+                    )
+                    assert isinstance(r, bool)
+
+
+def test_type_utils_find_buf_dtype():
+    def _denier_fn(dt):
+        return False
+
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            arg_dt = dpt.float64
+            r = tu._find_buf_dtype(
+                arg_dt, _denier_fn, dev, tu._acceptance_fn_default_unary
+            )
+            assert r == (
+                None,
+                None,
+            )
+
+
+def test_type_utils_get_device_default_type():
+    with pytest.raises(RuntimeError):
+        tu._get_device_default_dtype("-", MockDevice(True, True))
+    try:
+        dev = dpctl.SyclDevice()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    for k in ["b", "i", "u", "f", "c"]:
+        dt = tu._get_device_default_dtype(k, dev)
+        assert isinstance(dt, dpt.dtype)
+        assert dt.kind == k
+
+
+def test_type_utils_find_buf_dtype2():
+    def _denier_fn(dt1, dt2):
+        return False
+
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            arg1_dt = dpt.float64
+            arg2_dt = dpt.complex64
+            r = tu._find_buf_dtype2(
+                arg1_dt,
+                arg2_dt,
+                _denier_fn,
+                dev,
+                tu._acceptance_fn_default_binary,
+            )
+            assert r == (
+                None,
+                None,
+                None,
+            )
+
+
+def test_unary_func_arg_validation():
+    with pytest.raises(TypeError):
+        dpt.abs([1, 2, 3])
+    try:
+        a = dpt.arange(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    # an unrecognized order value is not an error; it falls back to the default
+    dpt.abs(a, order="invalid")
+
+
+def test_binary_func_arg_validation():
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.add([1, 2, 3], 1)
+    try:
+        a = dpt.arange(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        dpt.add(a, Ellipsis)
+    # an unrecognized order value is not an error; it falls back to the default
+    dpt.add(a, a, order="invalid")
+
+
+def test_all_data_types():
+    fp16_fp64_types = {dpt.float16, dpt.float64,
dpt.complex128} + fp64_types = {dpt.float64, dpt.complex128} + + all_dts = tu._all_data_types(True, True) + assert fp16_fp64_types.issubset(all_dts) + + all_dts = tu._all_data_types(True, False) + assert dpt.float16 in all_dts + assert not fp64_types.issubset(all_dts) + + all_dts = tu._all_data_types(False, True) + assert dpt.float16 not in all_dts + assert fp64_types.issubset(all_dts) + + all_dts = tu._all_data_types(False, False) + assert not fp16_fp64_types.issubset(all_dts) + + +@pytest.mark.parametrize("fp16", [True, False]) +@pytest.mark.parametrize("fp64", [True, False]) +def test_maximal_inexact_types(fp16, fp64): + assert not tu._is_maximal_inexact_type(dpt.int32, fp16, fp64) + assert fp64 == tu._is_maximal_inexact_type(dpt.float64, fp16, fp64) + assert fp64 == tu._is_maximal_inexact_type(dpt.complex128, fp16, fp64) + assert fp64 != tu._is_maximal_inexact_type(dpt.float32, fp16, fp64) + assert fp64 != tu._is_maximal_inexact_type(dpt.complex64, fp16, fp64) + + +def test_can_cast_device(): + assert tu._can_cast(dpt.int64, dpt.float64, True, True) + # if f8 is available, can't cast i8 to f4 + assert not tu._can_cast(dpt.int64, dpt.float32, True, True) + assert not tu._can_cast(dpt.int64, dpt.float32, False, True) + # should be able to cast to f8 when f2 unavailable + assert tu._can_cast(dpt.int64, dpt.float64, False, True) + # casting to f4 acceptable when f8 unavailable + assert tu._can_cast(dpt.int64, dpt.float32, True, False) + assert tu._can_cast(dpt.int64, dpt.float32, False, False) + # can't safely cast inexact type to inexact type of lesser precision + assert not tu._can_cast(dpt.float32, dpt.float16, True, False) + assert not tu._can_cast(dpt.float64, dpt.float32, False, True) + + +def test_acceptance_fns(): + """Check type promotion acceptance functions""" + try: + dev = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("Default device is not available") + assert tu._acceptance_fn_reciprocal( + dpt.float32, dpt.float32, dpt.float32, dev + ) + assert tu._acceptance_fn_negative(dpt.int8, dpt.int16, dpt.int16, dev) + + +def test_weak_types(): + wbt = tu.WeakBooleanType(True) + assert wbt.get() + assert tu._weak_type_num_kind(wbt) == 0 + + wit = tu.WeakIntegralType(7) + assert wit.get() == 7 + assert tu._weak_type_num_kind(wit) == 1 + + wft = tu.WeakFloatingType(3.1415926) + assert wft.get() == 3.1415926 + assert tu._weak_type_num_kind(wft) == 2 + + wct = tu.WeakComplexType(2.0 + 3.0j) + assert wct.get() == 2 + 3j + assert tu._weak_type_num_kind(wct) == 3 + + +def test_arg_validation(): + with pytest.raises(TypeError): + tu._weak_type_num_kind(dict()) + + with pytest.raises(TypeError): + tu._strong_dtype_num_kind(Ellipsis) + + with pytest.raises(ValueError): + tu._strong_dtype_num_kind(np.dtype("O")) + + wt = tu.WeakFloatingType(2.0) + with pytest.raises(ValueError): + tu._resolve_weak_types(wt, wt, None) diff --git a/dpnp/tests/tensor/elementwise/utils.py b/dpnp/tests/tensor/elementwise/utils.py new file mode 100644 index 000000000000..6717ea577bd3 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/utils.py @@ -0,0 +1,74 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl + +import dpnp.tensor._type_utils as tu + +_integral_dtypes = [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", +] +_real_fp_dtypes = ["f2", "f4", "f8"] +_complex_fp_dtypes = [ + "c8", + "c16", +] +_real_value_dtypes = _integral_dtypes + _real_fp_dtypes +_no_complex_dtypes = [ + "b1", +] + _real_value_dtypes +_all_dtypes = _no_complex_dtypes + _complex_fp_dtypes + +_usm_types = ["device", "shared", "host"] + + +def _map_to_device_dtype(dt, dev): + return tu._to_device_supported_dtype(dt, dev) + + +def _compare_dtypes(dt, ref_dt, sycl_queue=None): + assert isinstance(sycl_queue, dpctl.SyclQueue) + dev = sycl_queue.sycl_device + expected_dt = _map_to_device_dtype(ref_dt, dev) + return dt == expected_dt + + +__all__ = [ + "_no_complex_dtypes", + "_all_dtypes", + "_usm_types", + "_map_to_device_dtype", + "_compare_dtypes", +] diff --git a/dpnp/tests/tensor/helper/__init__.py b/dpnp/tests/tensor/helper/__init__.py new file mode 100644 index 000000000000..7fdb1fbe553b --- /dev/null +++ b/dpnp/tests/tensor/helper/__init__.py @@ -0,0 +1,47 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +"""Helper module for tensor tests""" + +from ._helper import ( + create_invalid_capsule, + get_queue_or_skip, + has_cpu, + has_gpu, + has_sycl_platforms, + skip_if_dtype_not_supported, +) + +__all__ = [ + "create_invalid_capsule", + "has_cpu", + "has_gpu", + "has_sycl_platforms", + "get_queue_or_skip", + "skip_if_dtype_not_supported", +] diff --git a/dpnp/tests/tensor/helper/_helper.py b/dpnp/tests/tensor/helper/_helper.py new file mode 100644 index 000000000000..5d0b4825e953 --- /dev/null +++ b/dpnp/tests/tensor/helper/_helper.py @@ -0,0 +1,89 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import pytest + + +def has_gpu(backend="opencl"): + return bool(dpctl.get_num_devices(backend=backend, device_type="gpu")) + + +def has_cpu(backend="opencl"): + return bool(dpctl.get_num_devices(backend=backend, device_type="cpu")) + + +def has_sycl_platforms(): + return bool(len(dpctl.get_platforms())) + + +def create_invalid_capsule(): + """Creates an invalid capsule for the purpose of testing dpctl + constructors that accept capsules. 
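+
+    The capsule is given a deliberately unexpected name (b"invalid"), so a
+    consumer that checks the capsule name is expected to reject it.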
+ """ + import ctypes + + ctor = ctypes.pythonapi.PyCapsule_New + ctor.restype = ctypes.py_object + ctor.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] + return ctor(id(ctor), b"invalid", 0) + + +def get_queue_or_skip(args=()): + try: + q = dpctl.SyclQueue(*args) + except dpctl.SyclQueueCreationError: + pytest.skip(f"Queue could not be created from {args}") + return q + + +def skip_if_dtype_not_supported(dt, q_or_dev): + import dpnp.tensor as dpt + + dt = dpt.dtype(dt) + if type(q_or_dev) is dpctl.SyclQueue: + dev = q_or_dev.sycl_device + elif type(q_or_dev) is dpctl.SyclDevice: + dev = q_or_dev + else: + raise TypeError( + "Expected dpctl.SyclQueue or dpctl.SyclDevice, " + f"got {type(q_or_dev)}" + ) + dev_has_dp = dev.has_aspect_fp64 + if dev_has_dp is False and dt in [dpt.float64, dpt.complex128]: + pytest.skip( + f"{dev.name} does not support double precision floating point types" + ) + dev_has_hp = dev.has_aspect_fp16 + if dev_has_hp is False and dt in [ + dpt.float16, + ]: + pytest.skip( + f"{dev.name} does not support half precision floating point type" + ) diff --git a/dpnp/tests/tensor/test_tensor_accumulation.py b/dpnp/tests/tensor/test_tensor_accumulation.py new file mode 100644 index 000000000000..b7ea9147e100 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_accumulation.py @@ -0,0 +1,449 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from random import randrange + +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +sint_types = [ + dpt.int8, + dpt.int16, + dpt.int32, + dpt.int64, +] +uint_types = [ + dpt.uint8, + dpt.uint16, + dpt.uint32, + dpt.uint64, +] +rfp_types = [ + dpt.float16, + dpt.float32, + dpt.float64, +] +cfp_types = [ + dpt.complex64, + dpt.complex128, +] + +no_complex_types = [dpt.bool] + sint_types + uint_types + rfp_types + +all_types = [dpt.bool] + sint_types + uint_types + rfp_types + cfp_types + + +@pytest.mark.parametrize("dt", sint_types) +def test_contig_cumsum_sint(dt): + get_queue_or_skip() + n = 10000 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n) + + res = dpt.cumulative_sum(x, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + +@pytest.mark.parametrize("dt", sint_types) +def test_strided_cumsum_sint(dt): + get_queue_or_skip() + n = 10000 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n)[1::2] + + res = dpt.cumulative_sum(x, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + x2 = dpt.repeat(dpt.asarray([-1, 1], dtype=dt), 2 * n)[-1::-2] + + res = dpt.cumulative_sum(x2, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + +@pytest.mark.parametrize("dt", sint_types) +def test_contig_cumsum_axis_sint(dt): + get_queue_or_skip() + n0, n1 = 1000, 173 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n0) + m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1)) + + res = dpt.cumulative_sum(m, dtype=dt, axis=0) + + ar = dpt.arange(n0, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == dpt.expand_dims(expected, axis=1)) + + +@pytest.mark.parametrize("dt", sint_types) +def test_strided_cumsum_axis_sint(dt): + get_queue_or_skip() + n0, n1 = 1000, 173 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n0) + m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1))[1::2, ::-1] + + res = dpt.cumulative_sum(m, dtype=dt, axis=0) + + ar = dpt.arange(n0, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == dpt.expand_dims(expected, axis=1)) + + +def test_accumulate_scalar(): + get_queue_or_skip() + + s = dpt.asarray(1, dtype="i8") + r = dpt.cumulative_sum(s) + assert r == s + assert r.ndim == s.ndim + + r = dpt.cumulative_sum(s, include_initial=True) + r_expected = dpt.asarray([0, 1], dtype="i8") + assert dpt.all(r == r_expected) + + +def test_cumulative_sum_include_initial(): + get_queue_or_skip() + + n0, n1 = 3, 5 + x = dpt.ones((n0, n1), dtype="i4") + r = dpt.cumulative_sum(x, axis=0, include_initial=True) + assert dpt.all(r[0, :] == 0) + + r = dpt.cumulative_sum(x, axis=1, include_initial=True) + assert dpt.all(r[:, 0] == 0) + + x = dpt.ones(n1, dtype="i4") + r = dpt.cumulative_sum(x, include_initial=True) + assert r.shape == (n1 + 1,) + assert r[0] == 0 + + +def test_cumulative_prod_identity(): + get_queue_or_skip() + + x = dpt.zeros(5, dtype="i4") + r = dpt.cumulative_prod(x, include_initial=True) + assert r[0] == 1 + + +def test_cumulative_logsumexp_identity(): + get_queue_or_skip() + + x = dpt.ones(5, dtype="f4") + r = dpt.cumulative_logsumexp(x, include_initial=True) + assert r[0] == -dpt.inf + + +def test_accumulate_zero_size_dims(): + get_queue_or_skip() + + 
n0, n1, n2 = 3, 0, 5
+    x = dpt.ones((n0, n1, n2), dtype="i8")
+    r = dpt.cumulative_sum(x, axis=1)
+    assert r.shape == x.shape
+    assert r.size == 0
+
+    r = dpt.cumulative_sum(x, axis=0)
+    assert r.shape == x.shape
+    assert r.size == 0
+
+    r = dpt.cumulative_sum(x, axis=1, include_initial=True)
+    assert r.shape == (n0, n1 + 1, n2)
+    assert r.size == (n0 * n2)
+
+    r = dpt.cumulative_sum(x, axis=0, include_initial=True)
+    assert r.shape == (n0 + 1, n1, n2)
+    assert r.size == 0
+
+
+@pytest.mark.parametrize("arg_dtype", all_types)
+def test_cumsum_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    n = 100
+    x = dpt.ones(n, dtype=arg_dtype)
+    r = dpt.cumulative_sum(x)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    if x.dtype.kind == "i":
+        assert r.dtype.kind == "i"
+    elif x.dtype.kind == "u":
+        assert r.dtype.kind == "u"
+    elif x.dtype.kind in "fc":
+        assert r.dtype == arg_dtype
+
+    r_expected = dpt.arange(1, n + 1, dtype=r.dtype)
+
+    assert dpt.all(r == r_expected)
+
+
+@pytest.mark.parametrize("arg_dtype", all_types)
+@pytest.mark.parametrize("out_dtype", all_types)
+def test_cumsum_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    n = 100
+    x = dpt.ones(n, dtype=arg_dtype)
+    r = dpt.cumulative_sum(x, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+    if out_dtype == dpt.bool:
+        assert dpt.all(r)
+    else:
+        r_expected = dpt.arange(1, n + 1, dtype=out_dtype)
+        assert dpt.all(r == r_expected)
+
+
+def test_accumulator_out_kwarg():
+    q = get_queue_or_skip()
+
+    n = 100
+
+    expected = dpt.arange(1, n + 1, dtype="i4", sycl_queue=q)
+    x = dpt.ones(n, dtype="i4", sycl_queue=q)
+    out = dpt.empty_like(x, dtype="i4")
+    dpt.cumulative_sum(x, dtype="i4", out=out)
+    assert dpt.all(expected == out)
+
+    # overlap
+    x = dpt.ones(n, dtype="i4", sycl_queue=q)
+    dpt.cumulative_sum(x, dtype="i4", out=x)
+    assert dpt.all(x == expected)
+
+    # axis before final axis
+    expected = dpt.broadcast_to(
+        dpt.arange(1, n + 1, dtype="i4", sycl_queue=q), (n, n)
+    ).mT
+    x = dpt.ones((n, n), dtype="i4", sycl_queue=q)
+    out = dpt.empty_like(x, dtype="i4")
+    dpt.cumulative_sum(x, axis=0, dtype="i4", out=out)
+    assert dpt.all(expected == out)
+
+    # scalar
+    x = dpt.asarray(3, dtype="i4")
+    out = dpt.empty((), dtype="i4")
+    expected = 3
+    dpt.cumulative_sum(x, dtype="i4", out=out)
+    assert expected == out
+
+
+def test_accumulator_arg_validation():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    n = 5
+    x1 = dpt.ones((n, n), dtype="f4", sycl_queue=q1)
+    x2 = dpt.ones(n, dtype="f4", sycl_queue=q1)
+
+    # must be usm_ndarray
+    with pytest.raises(TypeError):
+        dpt.cumulative_sum(dict())
+
+    # axis must be specified when input not 1D
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x1)
+
+    # out must be usm_ndarray
+    with pytest.raises(TypeError):
+        dpt.cumulative_sum(x2, out=dict())
+
+    # out must be writable
+    out_not_writable = dpt.empty_like(x2)
+    out_not_writable.flags.writable = False
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2, out=out_not_writable)
+
+    # out must be expected shape
+    out_wrong_shape = dpt.ones(n + 1, dtype=x2.dtype, sycl_queue=q1)
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2, out=out_wrong_shape)
+
+    # out must be expected dtype
+    out_wrong_dtype = dpt.empty_like(x2, dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2,
out=out_wrong_dtype) + + # compute follows data + out_wrong_queue = dpt.empty_like(x2, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.cumulative_sum(x2, out=out_wrong_queue) + + +def test_cumsum_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_sum(x) + assert dpt.all(dpt.isnan(r[i:])) + + +def test_cumprod_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_prod(x) + assert dpt.all(dpt.isnan(r[i:])) + + +def test_logcumsumexp_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_logsumexp(x) + assert dpt.all(dpt.isnan(r[i:])) + + +@pytest.mark.parametrize("arg_dtype", no_complex_types) +def test_logcumsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones(10, dtype=arg_dtype, sycl_queue=q) + r = dpt.cumulative_logsumexp(x) + + if arg_dtype.kind in "biu": + assert r.dtype.kind == "f" + else: + assert r.dtype == arg_dtype + + +def test_logcumsumexp_complex_error(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="c8") + with pytest.raises(ValueError): + dpt.cumulative_logsumexp(x) + + +def test_cumprod_basic(): + get_queue_or_skip() + + n = 50 + val = 2 + x = dpt.full(n, val, dtype="i8") + r = dpt.cumulative_prod(x) + expected = dpt.pow(val, dpt.arange(1, n + 1, dtype="i8")) + + assert dpt.all(r == expected) + + x = dpt.tile(dpt.asarray([2, 0.5], dtype="f4"), 10000) + expected = dpt.tile(dpt.asarray([2, 1], dtype="f4"), 10000) + r = dpt.cumulative_prod(x) + assert dpt.all(r == expected) + + +def test_logcumsumexp_basic(): + get_queue_or_skip() + + dt = dpt.float32 + x = dpt.ones(1000, dtype=dt) + r = dpt.cumulative_logsumexp(x) + + expected = 1 + dpt.log(dpt.arange(1, 1001, dtype=dt)) + + tol = 4 * dpt.finfo(dt).resolution + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +def geometric_series_closed_form(n, dtype=None, device=None): + """Closed form for cumulative_logsumexp(dpt.arange(-n, 0)) + + :math:`r[k] == -n + k + log(1 - exp(-k-1)) - log(1-exp(-1))` + """ + x = dpt.arange(-n, 0, dtype=dtype, device=device) + y = dpt.arange(-1, -n - 1, step=-1, dtype=dtype, device=device) + y = dpt.exp(y, out=y) + y = dpt.negative(y, out=y) + y = dpt.log1p(y, out=y) + y -= y[0] + return x + y + + +@pytest.mark.parametrize("fpdt", rfp_types) +def test_cumulative_logsumexp_closed_form(fpdt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(fpdt, q) + + n = 128 + r = dpt.cumulative_logsumexp(dpt.arange(-n, 0, dtype=fpdt, device=q)) + expected = geometric_series_closed_form(n, dtype=fpdt, device=q) + + tol = 4 * dpt.finfo(fpdt).eps + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("p", [257, 260, 273, 280, 509, 512]) +def test_cumulative_sum_gh_1901(p): + get_queue_or_skip() + + n = p * p + dt = dpt.int32 + inp = dpt.ones(n, dtype=dt) + r = dpt.cumulative_sum(inp, dtype=dt) + assert dpt.all(r == dpt.arange(1, n + 1, dtype=dt)) + + +@pytest.mark.parametrize( + "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"] +) +def test_gh_2017(dt): + "See https://github.com/IntelPython/dpctl/issues/2017" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q) + r = dpt.cumulative_sum(x, dtype="?") + assert 
dpt.all(r) diff --git a/dpnp/tests/tensor/test_tensor_array_api_inspection.py b/dpnp/tests/tensor/test_tensor_array_api_inspection.py new file mode 100644 index 000000000000..2eb198944656 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_array_api_inspection.py @@ -0,0 +1,238 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._tensor_impl import ( + default_device_complex_type, + default_device_fp_type, + default_device_index_type, + default_device_int_type, +) + +_dtypes_no_fp16_fp64 = { + "bool": dpt.bool, + "float32": dpt.float32, + "complex64": dpt.complex64, + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + "uint8": dpt.uint8, + "uint16": dpt.uint16, + "uint32": dpt.uint32, + "uint64": dpt.uint64, +} + + +def test_array_api_inspection_methods(): + info = dpt.__array_namespace_info__() + assert info.capabilities() + try: + assert info.default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + assert info.default_dtypes() + assert info.devices() + assert info.dtypes() + + +def test_array_api_inspection_default_device(): + try: + dev = dpctl.select_default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + assert dpt.__array_namespace_info__().default_device() == dev + + +def test_array_api_inspection_devices(): + try: + devices2 = dpctl.get_devices() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + devices1 = dpt.__array_namespace_info__().devices() + assert len(devices1) == len(devices2) + assert devices1 == devices2 + + +def test_array_api_inspection_capabilities(): + capabilities = dpt.__array_namespace_info__().capabilities() + assert capabilities["boolean indexing"] + assert capabilities["data-dependent shapes"] + assert capabilities["max dimensions"] is None + + +def test_array_api_inspection_default_dtypes(): + try: + dev = dpctl.select_default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + int_dt = default_device_int_type(dev) + ind_dt = default_device_index_type(dev) + fp_dt = default_device_fp_type(dev) + cm_dt = default_device_complex_type(dev) + + info = dpt.__array_namespace_info__() + default_dts_nodev = info.default_dtypes() + default_dts_dev = info.default_dtypes(device=dev) + + assert ( + int_dt == default_dts_nodev["integral"] == default_dts_dev["integral"] + ) + assert ( + ind_dt == default_dts_nodev["indexing"] == default_dts_dev["indexing"] + ) + assert ( + fp_dt + == default_dts_nodev["real floating"] + == default_dts_dev["real floating"] + ) + assert ( + cm_dt + == default_dts_nodev["complex floating"] + == default_dts_dev["complex floating"] + ) + + +def test_array_api_inspection_default_device_dtypes(): + try: + dev = dpctl.select_default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + dtypes = _dtypes_no_fp16_fp64.copy() + if dev.has_aspect_fp64: + dtypes["float64"] = dpt.float64 + dtypes["complex128"] = dpt.complex128 + + assert dtypes == dpt.__array_namespace_info__().dtypes() + + +def test_array_api_inspection_device_dtypes(): + info = dpt.__array_namespace_info__() + try: + dev = info.default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + dtypes = _dtypes_no_fp16_fp64.copy() + if dev.has_aspect_fp64: + dtypes["float64"] = dpt.float64 + dtypes["complex128"] = dpt.complex128 + + assert dtypes == dpt.__array_namespace_info__().dtypes(device=dev) + + +def test_array_api_inspection_dtype_kind(): + info = dpt.__array_namespace_info__() + try: + info.default_device() + except dpctl.SyclDeviceCreationError: + 
pytest.skip("No default device available") + + f_dtypes = info.dtypes(kind="real floating") + assert all([_dt[1].kind == "f" for _dt in f_dtypes.items()]) + + i_dtypes = info.dtypes(kind="signed integer") + assert all([_dt[1].kind == "i" for _dt in i_dtypes.items()]) + + u_dtypes = info.dtypes(kind="unsigned integer") + assert all([_dt[1].kind == "u" for _dt in u_dtypes.items()]) + + ui_dtypes = info.dtypes(kind="unsigned integer") + assert all([_dt[1].kind in "ui" for _dt in ui_dtypes.items()]) + + c_dtypes = info.dtypes(kind="complex floating") + assert all([_dt[1].kind == "c" for _dt in c_dtypes.items()]) + + assert info.dtypes(kind="bool") == {"bool": dpt.bool} + + _signed_ints = { + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + } + assert ( + info.dtypes(kind=("signed integer", "signed integer")) == _signed_ints + ) + assert ( + info.dtypes( + kind=("integral", "bool", "real floating", "complex floating") + ) + == info.dtypes() + ) + assert info.dtypes( + kind=("integral", "real floating", "complex floating") + ) == info.dtypes(kind="numeric") + + +def test_array_api_inspection_dtype_kind_errors(): + info = dpt.__array_namespace_info__() + try: + info.default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + with pytest.raises(ValueError): + info.dtypes(kind="error") + + with pytest.raises(TypeError): + info.dtypes(kind={0: "real floating"}) + + +def test_array_api_inspection_device_types(): + info = dpt.__array_namespace_info__() + try: + dev = info.default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + q = dpctl.SyclQueue(dev) + assert info.default_dtypes(device=q) + assert info.dtypes(device=q) + + dev_dpt = dpt.Device.create_device(dev) + assert info.default_dtypes(device=dev_dpt) + assert info.dtypes(device=dev_dpt) + + filter = dev.get_filter_string() + assert info.default_dtypes(device=filter) + assert info.dtypes(device=filter) + + +def test_array_api_inspection_device_errors(): + info = dpt.__array_namespace_info__() + + bad_dev = {} + with pytest.raises(TypeError): + info.dtypes(device=bad_dev) + + with pytest.raises(TypeError): + info.default_dtypes(device=bad_dev) diff --git a/dpnp/tests/tensor/test_tensor_asarray.py b/dpnp/tests/tensor/test_tensor_asarray.py new file mode 100644 index 000000000000..f5caacacdac6 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_asarray.py @@ -0,0 +1,664 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + + +@pytest.mark.parametrize( + "src_usm_type, dst_usm_type", + [ + ("device", "shared"), + ("device", "host"), + ("shared", "device"), + ("shared", "host"), + ("host", "device"), + ("host", "shared"), + ], +) +def test_asarray_change_usm_type(src_usm_type, dst_usm_type): + try: + d = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X = dpt.empty(10, dtype="u1", usm_type=src_usm_type) + Y = dpt.asarray(X, usm_type=dst_usm_type) + assert X.shape == Y.shape + assert X.usm_type == src_usm_type + assert Y.usm_type == dst_usm_type + + with pytest.raises(ValueError): + # zero copy is not possible + dpt.asarray(X, usm_type=dst_usm_type, copy=False) + + Y = dpt.asarray(X, usm_type=dst_usm_type, sycl_queue=X.sycl_queue) + assert X.shape == Y.shape + assert Y.usm_type == dst_usm_type + + Y = dpt.asarray( + X, + usm_type=dst_usm_type, + sycl_queue=X.sycl_queue, + device=d.get_filter_string(), + ) + assert X.shape == Y.shape + assert Y.usm_type == dst_usm_type + + +def test_asarray_from_numpy(): + Xnp = np.arange(10) + try: + Y = dpt.asarray(Xnp, usm_type="device") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == Xnp.shape + assert Y.dtype == Xnp.dtype + # Fortran contiguous case + Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="f4", order="F") + Y = dpt.asarray(Xnp, usm_type="shared") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == Xnp.shape + assert Y.dtype == Xnp.dtype + # general strided case + Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="i8") + Y = dpt.asarray(Xnp[::-1, ::-1], usm_type="host") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == Xnp.shape + assert Y.dtype == Xnp.dtype + + +def test_asarray_from_sequence(): + X = [1, 2, 3] + try: + Y = dpt.asarray(X, usm_type="device") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert type(Y) is dpt.usm_ndarray + + X = [(1, 1), (2.0, 2.0 + 1.0j), range(4, 6), np.array([3, 4], dtype="c16")] + Y = dpt.asarray(X, usm_type="device") + assert type(Y) is dpt.usm_ndarray + assert Y.ndim == 2 + assert Y.shape == (len(X), 2) + + X = [] + Y = dpt.asarray(X, usm_type="device") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == (0,) + + X = [[], []] + Y = dpt.asarray(X, usm_type="device") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == (2, 0) + + X = [True, False] + Y = dpt.asarray(X, usm_type="device") + assert type(Y) is dpt.usm_ndarray 
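+    # a list of Python booleans is expected to infer a boolean dtype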
+ assert Y.dtype.kind == "b" + + +def test_asarray_from_object_with_suai(): + """Test that asarray can deal with opaque objects implementing SUAI""" + + class Dummy: + def __init__(self, obj, iface): + self.obj = obj + self.__sycl_usm_array_interface__ = iface + + try: + X = dpt.empty((2, 3, 4), dtype="f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Y = dpt.asarray(Dummy(X, X.__sycl_usm_array_interface__)) + assert Y.shape == X.shape + assert X.usm_type == Y.usm_type + assert X.dtype == Y.dtype + assert X.sycl_device == Y.sycl_device + + +def test_asarray_input_validation(): + with pytest.raises(TypeError): + # copy keyword is not of right type + dpt.asarray([1], copy="invalid") + with pytest.raises(TypeError): + # order keyword is not valid + dpt.asarray([1], order=1) + with pytest.raises(TypeError): + # dtype is not valid + dpt.asarray([1], dtype="invalid") + with pytest.raises(ValueError): + # unexpected value of order + dpt.asarray([1], order="Z") + with pytest.raises(TypeError): + # usm_type is of wrong type + dpt.asarray([1], usm_type=dict()) + with pytest.raises(ValueError): + # usm_type has wrong value + dpt.asarray([1], usm_type="mistake") + try: + wrong_queue_type = dpctl.SyclContext() + except dpctl.SyclContextCreationError: + # use any other type + wrong_queue_type = Ellipsis + with pytest.raises(TypeError): + # sycl_queue type is not right + dpt.asarray([1], sycl_queue=wrong_queue_type) + with pytest.raises(ValueError): + # sequence is not rectangular + dpt.asarray([[1], 2]) + with pytest.raises(OverflowError): + # Python int too large for type + dpt.asarray(-9223372036854775809, dtype="i4") + with pytest.raises(ValueError): + # buffer to usm_ndarray requires a copy + dpt.asarray(memoryview(np.arange(5)), copy=False) + with pytest.raises(ValueError): + # Numpy array to usm_ndarray requires a copy + dpt.asarray(np.arange(5), copy=False) + with pytest.raises(ValueError): + # Python sequence to usm_ndarray requires a copy + dpt.asarray([1, 2, 3], copy=False) + with pytest.raises(ValueError): + # Python scalar to usm_ndarray requires a copy + dpt.asarray(5, copy=False) + + +def test_asarray_input_validation2(): + d = dpctl.get_devices() + if len(d) < 2: + pytest.skip("Not enough SYCL devices available") + + d0, d1 = d[:2] + try: + q0 = dpctl.SyclQueue(d0) + except dpctl.SyclQueueCreationError: + pytest.skip(f"SyclQueue could not be created for {d0}") + try: + q1 = dpctl.SyclQueue(d1) + except dpctl.SyclQueueCreationError: + pytest.skip(f"SyclQueue could not be created for {d1}") + with pytest.raises(TypeError): + dpt.asarray([1, 2], sycl_queue=q0, device=q1) + + +def test_asarray_scalars(): + import ctypes + + try: + Y = dpt.asarray(5) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert Y.dtype == dpt.dtype(int) + Y = dpt.asarray(5.2) + if Y.sycl_device.has_aspect_fp64: + assert Y.dtype == dpt.dtype(float) + else: + assert Y.dtype == dpt.dtype(dpt.float32) + Y = dpt.asarray(np.float32(2.3)) + assert Y.dtype == dpt.dtype(dpt.float32) + Y = dpt.asarray(1.0j) + if Y.sycl_device.has_aspect_fp64: + assert Y.dtype == dpt.dtype(complex) + else: + assert Y.dtype == dpt.dtype(dpt.complex64) + Y = dpt.asarray(ctypes.c_int(8)) + assert Y.dtype == dpt.dtype(ctypes.c_int) + + +def test_asarray_copy_false(): + q = get_queue_or_skip() + rng = np.random.default_rng() + Xnp = rng.integers(low=-255, high=255, size=(10, 4), dtype=np.int64) + X = dpt.from_numpy(Xnp, usm_type="device", sycl_queue=q) + Y1 = dpt.asarray(X, 
copy=False, order="K") + assert Y1 is X + Y1c = dpt.asarray(X, copy=True, order="K") + assert not (Y1c is X) + Y2 = dpt.asarray(X, copy=False, order="C") + assert Y2 is X + Y3 = dpt.asarray(X, copy=False, order="A") + assert Y3 is X + with pytest.raises(ValueError): + Y1 = dpt.asarray(X, copy=False, order="F") + Xf = dpt.empty( + X.shape, + dtype=X.dtype, + usm_type="device", + sycl_queue=X.sycl_queue, + order="F", + ) + Xf[:] = X + Y4 = dpt.asarray(Xf, copy=False, order="K") + assert Y4 is Xf + Y5 = dpt.asarray(Xf, copy=False, order="F") + assert Y5 is Xf + Y6 = dpt.asarray(Xf, copy=False, order="A") + assert Y6 is Xf + with pytest.raises(ValueError): + dpt.asarray(Xf, copy=False, order="C") + + +def test_asarray_invalid_dtype(): + q = get_queue_or_skip() + Xnp = np.array([1, 2, 3], dtype=object) + with pytest.raises(TypeError): + dpt.asarray(Xnp, sycl_queue=q) + + +def test_asarray_cross_device(): + q = get_queue_or_skip() + qprof = dpctl.SyclQueue(property="enable_profiling") + x = dpt.empty(10, dtype="i8", sycl_queue=q) + y = dpt.asarray(x, sycl_queue=qprof) + assert y.sycl_queue == qprof + + +def test_asarray_seq_of_arrays_simple(): + get_queue_or_skip() + r = dpt.arange(10) + m = dpt.asarray( + [ + r, + ] + * 4 + ) + assert m.shape == (4,) + r.shape + assert m.dtype == r.dtype + assert m.device == r.device + + +def test_asarray_seq_of_arrays(): + get_queue_or_skip() + m = dpt.ones((2, 4), dtype="i4") + w = dpt.zeros(4) + v = dpt.full(4, -1) + ar = dpt.asarray([m, [w, v]]) + assert ar.shape == (2, 2, 4) + assert ar.device == m.device + assert ar.device == w.device + assert ar.device == v.device + + +def test_asarray_seq_of_array_different_queue(): + get_queue_or_skip() + m = dpt.ones((2, 4), dtype="i4") + w = dpt.zeros(4) + v = dpt.full(4, -1) + qprof = dpctl.SyclQueue(property="enable_profiling") + ar = dpt.asarray([m, [w, v]], sycl_queue=qprof) + assert ar.shape == (2, 2, 4) + assert ar.sycl_queue == qprof + + +def test_asarray_seq_of_suai(): + get_queue_or_skip() + + class Dummy: + def __init__(self, obj, iface): + self.obj = obj + self.__sycl_usm_array_interface__ = iface + + o = dpt.empty(0, usm_type="shared") + d = Dummy(o, o.__sycl_usm_array_interface__) + x = dpt.asarray(d) + assert x.shape == (0,) + assert x.usm_type == o.usm_type + assert x._pointer == o._pointer + assert x.sycl_queue == o.sycl_queue + + x = dpt.asarray([d, d]) + assert x.shape == (2, 0) + assert x.usm_type == o.usm_type + assert x.sycl_queue == o.sycl_queue + + +def test_asarray_seq_of_suai_different_queue(): + q = get_queue_or_skip() + + class Dummy: + def __init__(self, obj, iface): + self.obj = obj + self.__sycl_usm_array_interface__ = iface + + @property + def shape(self): + return self.__sycl_usm_array_interface__["shape"] + + q2 = dpctl.SyclQueue() + assert q != q2 + o = dpt.empty((2, 2), usm_type="shared", sycl_queue=q2) + d = Dummy(o, o.__sycl_usm_array_interface__) + + x = dpt.asarray(d, sycl_queue=q) + assert x.sycl_queue == q + assert x.shape == d.shape + x = dpt.asarray([d], sycl_queue=q) + assert x.sycl_queue == q + assert x.shape == (1,) + d.shape + x = dpt.asarray([d, d], sycl_queue=q) + assert x.sycl_queue == q + assert x.shape == (2,) + d.shape + + +def test_asarray_seq_of_arrays_on_different_queues(): + q = get_queue_or_skip() + + m = dpt.empty((2, 4), dtype="i2", sycl_queue=q) + q2 = dpctl.SyclQueue() + w = dpt.empty(4, dtype="i1", sycl_queue=q2) + q3 = dpctl.SyclQueue() + py_seq = [ + 0, + ] * w.shape[0] + res = dpt.asarray([m, [w, py_seq]], sycl_queue=q3) + assert res.sycl_queue == q3 + 
assert dpt.isdtype(res.dtype, "integral") + + res = dpt.asarray([m, [w, range(w.shape[0])]], sycl_queue=q3) + assert res.sycl_queue == q3 + assert dpt.isdtype(res.dtype, "integral") + + res = dpt.asarray([m, [w, w]], sycl_queue=q) + assert res.sycl_queue == q + assert dpt.isdtype(res.dtype, "integral") + + res = dpt.asarray([m, [w, dpt.asnumpy(w)]], sycl_queue=q2) + assert res.sycl_queue == q2 + assert dpt.isdtype(res.dtype, "integral") + + res = dpt.asarray([w, dpt.asnumpy(w)]) + assert res.sycl_queue == w.sycl_queue + assert dpt.isdtype(res.dtype, "integral") + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.asarray([m, [w, py_seq]]) + + +def test_ulonglong_gh_1167(): + get_queue_or_skip() + x = dpt.asarray(9223372036854775807, dtype="u8") + assert x.dtype == dpt.uint64 + x = dpt.asarray(9223372036854775808, dtype="u8") + assert x.dtype == dpt.uint64 + + +def test_orderK_gh_1350(): + get_queue_or_skip() + a = dpt.empty((2, 3, 4), dtype="u1") + b = dpt.permute_dims(a, (2, 0, 1)) + c = dpt.asarray(b, copy=True, order="K") + + assert c.shape == b.shape + assert c.strides == b.strides + assert c._element_offset == 0 + assert not c._pointer == b._pointer + + +def _typesafe_arange(n: int, dtype_: dpt.dtype, device: object): + n_half = n // 2 + if dtype_.kind in "ui": + ii = dpt.iinfo(dtype_) + m0 = max(ii.min, -n_half) + m1 = min(m0 + n, ii.max) + n_tiles = (n + m1 - m0 - 1) // (m1 - m0) + res = dpt.arange(m0, m1, dtype=dtype_, device=device) + elif dtype_.kind == "b": + n_tiles = (n + 1) // 2 + res = dpt.asarray([False, True], dtype=dtype_, device=device) + else: + m0 = -n_half + m1 = m0 + n + n_tiles = 1 + res = dpt.linspace(m0, m1, num=n, dtype=dtype_, device=device) + if n_tiles > 1: + res = dpt.tile(res, n_tiles)[:n] + return res + + +_all_dtypes = [ + "b1", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_c_contig_rect(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1, n2 = 6, 35, 37 + + arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n2)).mT + + y = dpt.asarray(x, order="C") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="C") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="C") + assert dpt.all(x3 == y3) + + x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)).mT + x5 = x4[:, :2] + y5 = dpt.asarray(x5, order="C") + assert dpt.all(x5 == y5) + + x6 = dpt.reshape(arr_flat, (n0, n1, n2), order="F") + y6 = dpt.asarray(x6, order="C") + assert dpt.all(x6 == y6) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_f_contig_rect(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1, n2 = 6, 35, 37 + + arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n2)) + + y = dpt.asarray(x, order="F") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="F") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="F") + assert dpt.all(x3 == y3) + + x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)) + x5 = dpt.moveaxis(x4[:, :2], (2, 3), (0, 1)) + y5 = dpt.asarray(x5, order="F") + assert dpt.all(x5 == y5) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_c_contig_square(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 4, 53 + + arr_flat = 
_typesafe_arange(n0 * n1 * n1, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n1)).mT + + y = dpt.asarray(x, order="C") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="C") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="C") + assert dpt.all(x3 == y3) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_f_contig_square(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 6, 53 + + arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q) + x = dpt.moveaxis(dpt.reshape(arr_flat, (n0, n1, n1)), (1, 2), (0, 1)) + + y = dpt.asarray(x, order="F") + assert dpt.all(x == y) + + x2 = x[..., 0] + y2 = dpt.asarray(x2, order="F") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="F") + assert dpt.all(x3 == y3) + + +class MockArrayWithBothProtocols: + """ + Object that implements both __sycl_usm_array_interface__ + and __usm_ndarray__ properties. + """ + + def __init__(self, usm_ar): + if not isinstance(usm_ar, dpt.usm_ndarray): + raise TypeError + self._arr = usm_ar + + @property + def __usm_ndarray__(self): + return self._arr + + @property + def __sycl_usm_array_interface__(self): + return self._arr.__sycl_usm_array_interface__ + + +class MockArrayWithSUAIOnly: + """ + Object that implements only the + __sycl_usm_array_interface__ property. + """ + + def __init__(self, usm_ar): + if not isinstance(usm_ar, dpt.usm_ndarray): + raise TypeError + self._arr = usm_ar + + @property + def __sycl_usm_array_interface__(self): + return self._arr.__sycl_usm_array_interface__ + + +@pytest.mark.parametrize("usm_type", ["shared", "device", "host"]) +def test_asarray_support_for_usm_ndarray_protocol(usm_type): + get_queue_or_skip() + + x = dpt.arange(256, dtype="i4", usm_type=usm_type) + + o1 = MockArrayWithBothProtocols(x) + o2 = MockArrayWithSUAIOnly(x) + + y1 = dpt.asarray(o1) + assert x.sycl_queue == y1.sycl_queue + assert x.usm_type == y1.usm_type + assert x.dtype == y1.dtype + assert y1.usm_data.reference_obj is None + assert dpt.all(x == y1) + + y2 = dpt.asarray(o2) + assert x.sycl_queue == y2.sycl_queue + assert x.usm_type == y2.usm_type + assert x.dtype == y2.dtype + assert not (y2.usm_data.reference_obj is None) + assert dpt.all(x == y2) + + y3 = dpt.asarray([o1, o2]) + assert x.sycl_queue == y3.sycl_queue + assert x.usm_type == y3.usm_type + assert x.dtype == y3.dtype + assert y3.usm_data.reference_obj is None + assert dpt.all(x[dpt.newaxis, :] == y3) + + +@pytest.mark.parametrize("dt", [dpt.float16, dpt.float64, dpt.complex128]) +def test_asarray_to_device_with_unsupported_dtype(dt): + aspect = "fp16" if dt == dpt.float16 else "fp64" + try: + d0 = dpctl.select_device_with_aspects(aspect) + except dpctl.SyclDeviceCreationError: + pytest.skip("No device with aspect for test") + d1 = None + for d in dpctl.get_devices(): + if d.default_selector_score < 0: + pass + try: + d1 = dpctl.select_device_with_aspects( + d.device_type.name, excluded_aspects=[aspect] + ) + except dpctl.SyclDeviceCreationError: + pass + if d1 is None: + pytest.skip("No device with missing aspect for test") + x = dpt.ones(10, dtype=dt, device=d0) + y = dpt.asarray(x, device=d1) + assert y.sycl_device == d1 diff --git a/dpnp/tests/tensor/test_tensor_clip.py b/dpnp/tests/tensor/test_tensor_clip.py new file mode 100644 index 000000000000..cfd9f6cfab2e --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_clip.py @@ -0,0 +1,792 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_raises_regex + +import dpnp.tensor as dpt +from dpnp.tensor._elementwise_common import _get_dtype +from dpnp.tensor._type_utils import ( + _can_cast, + _strong_dtype_num_kind, + _weak_type_num_kind, +) + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_usm_types = ["device", "shared", "host"] + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +def test_clip_dtypes(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q) + ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # also covers cases where dt1 == dt2 + if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64): + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=ar3, max=None) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=None, max=ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + else: + with pytest.raises(ValueError): + dpt.clip(ar1, ar2, ar3) + with pytest.raises(ValueError): + dpt.clip(ar1, 
min=ar3, max=None)
+        with pytest.raises(ValueError):
+            dpt.clip(ar1, min=None, max=ar3)
+
+
+def test_clip_empty():
+    get_queue_or_skip()
+
+    x = dpt.empty((2, 0, 3), dtype="i4")
+    a_min = dpt.ones((2, 0, 3), dtype="i4")
+    a_max = dpt.ones((2, 0, 3), dtype="i4")
+
+    r = dpt.clip(x, a_min, a_max)
+    assert r.size == 0
+    assert r.shape == x.shape
+
+
+def test_clip_python_scalars():
+    get_queue_or_skip()
+
+    arrs = [
+        dpt.ones(1, dtype="?"),
+        dpt.ones(1, dtype="i4"),
+        dpt.ones(1, dtype="f4"),
+        dpt.ones(1, dtype="c8"),
+    ]
+
+    py_zeros = [
+        False,
+        0,
+        0.0,
+        complex(0, 0),
+    ]
+
+    py_ones = [
+        True,
+        1,
+        1.0,
+        complex(1, 0),
+    ]
+
+    for zero, one, arr in zip(py_zeros, py_ones, arrs):
+        r = dpt.clip(arr, zero, one)
+        assert isinstance(r, dpt.usm_ndarray)
+        r = dpt.clip(arr, min=zero)
+        assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_clip_in_place():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    a_min = dpt.arange(1, 11, dtype="i4")
+    a_max = dpt.arange(2, 12, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=a_max)
+    assert dpt.all(a_max == a_min)
+
+    a_min = dpt.arange(1, 11, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=a_min[::-1])
+    assert dpt.all((x + 1)[::-1] == a_min)
+
+
+def test_clip_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="f4")
+    r = dpt.clip(x, -dpt.inf, dpt.inf)
+    assert dpt.all(r == x)
+    r = dpt.clip(x, dpt.nan, dpt.inf)
+    assert dpt.all(dpt.isnan(r))
+    r = dpt.clip(x, -dpt.inf, dpt.nan)
+    assert dpt.all(dpt.isnan(r))
+
+
+def test_clip_out_need_temporary():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i2")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i1")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.arange(12, dtype="i4")
+    dpt.clip(x[:6], out=x[-6:])
+    expected = dpt.arange(6, dtype="i4")
+    assert dpt.all(x[:-6] == expected) and dpt.all(x[-6:] == expected)
+
+    x = dpt.ones(10, dtype="i4")
+    dpt.clip(x, out=x)
+    assert dpt.all(x == 1)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i2")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
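+
+
+# Editorial sketch, not part of the upstream test suite: the overlap cases
+# above rely on clip staging its result in a temporary whenever `out` shares
+# memory with an input, so an overlapping destination still receives the
+# correct values. A minimal illustration (assuming the dpt.clip signature
+# exercised above):
+#
+#     x = dpt.arange(10, dtype="i4")    # [0, 1, ..., 9]
+#     dpt.clip(x[:6], 2, 3, out=x[:6])  # writes over its own input
+#     # x[:6] is now [2, 2, 2, 3, 3, 3]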
+
+
+def test_clip_out_need_temporary_none():
+    get_queue_or_skip()
+
+    x = dpt.full(6, 3, dtype="i4")
+    # with min/max == None
+    a_min = dpt.full(10, 2, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+
+def test_clip_arg_validation():
+    get_queue_or_skip()
+
+    check = {}
+    x1 = dpt.empty((1,), dtype="i4")
+    x2 = dpt.empty((1,), dtype="i4")
+
+    with pytest.raises(TypeError):
+        dpt.clip(check, x1, x2)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x1, check, x2)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x1, check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, x1, x2, out=check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, x2, out=check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, out=check)
+
+
+@pytest.mark.parametrize(
+    "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")]
+)
+def test_clip_order(dt1, dt2):
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
+    ar2 = dpt.ones(test_shape, dtype=dt1, order="C")
+    ar3 = dpt.ones(test_shape, dtype=dt2, order="C")
+    r1 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, ar2, ar3, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, ar2, ar3, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="F")
+    ar2 = dpt.ones(test_shape, dtype=dt1, order="F")
+    ar3 = dpt.ones(test_shape, dtype=dt2, order="F")
+    r1 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, ar2, ar3, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, ar2, ar3, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
+    ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.strides == (n, -1)
+    r5 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r5.strides == (n, 1)
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
+    ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.strides == (-1, n)
+    r5 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r5.strides == (n, 1)
+
+
+@pytest.mark.parametrize("dt", ["i4", "i2"])
+def test_clip_none_order(dt):
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
+    ar2 = dpt.ones(test_shape, dtype=dt, order="C")
+
+    r1 = dpt.clip(ar1, min=None, max=ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, min=None, max=ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, min=None, max=ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="F")
+    ar2 = dpt.ones(test_shape, dtype=dt, order="F")
+
+    r1 = dpt.clip(ar1, min=None, max=ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, min=None, max=ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, min=None, max=ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones(test_shape2, 
dtype=dt, order="C")[:20, ::-2] + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +@pytest.mark.parametrize("usm_type3", _usm_types) +def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3) + + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type( + (usm_type1, usm_type2, usm_type3) + ) + assert r.usm_type == expected_usm_type + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + + r = dpt.clip(ar1, min=ar2, max=None) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((usm_type1, usm_type2)) + assert r.usm_type == expected_usm_type + + +def test_clip_dtype_error(): + get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4") + ar2 = dpt.ones(1, dtype="i4") + ar3 = dpt.ones(1, dtype="i4") + ar4 = dpt.empty_like(ar1, dtype="f4") + + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + dpt.ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + None, + ar3, + ar4, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + ar2, + ar3, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + 1, + ar3, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + 1, + ar4, + ar3, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Execution placement can not 
be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + None, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="float32") + ar3 = dpt.ones_like(ar1, dtype="float32") + ar4 = dpt.empty(3, dtype="float32") + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + ar1 = np.ones(2, dtype="f4") + ar2 = dpt.ones(2, dtype="f4") + ar3 = dpt.ones(2, dtype="f4") + assert_raises_regex( + TypeError, + "Expected `x` to be of dpnp.tensor.usm_ndarray type*", + dpt.clip, + ar1, + ar2, + ar3, + ) + + ar1 = dpt.ones(2, dtype="i4") + ar2 = dpt.ones_like(ar1, dtype="i4") + ar3 = dpt.ones_like(ar1, dtype="i4") + ar4 = np.empty(ar1.shape, dtype=ar1.dtype) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + x3 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.clip(x1, x2, x3, out=out) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q) + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q) + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_strided(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 2 * 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2] + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + expected = expected[::-2] + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2] + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + a_max = a_max[::-2] + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +def test_clip_max_less_than_min(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.clip(x, 5, 0) + assert dpt.all(res == 0) + + +@pytest.mark.parametrize("dt", ["?", "i4", "f4", "c8"]) +def test_clip_minmax_weak_types(dt): + get_queue_or_skip() + + x = dpt.zeros(10, dtype=dt) + min_list = [False, 0, 0.0, 0.0 + 0.0j] + max_list = [True, 1, 1.0, 1.0 + 0.0j] + + for min_v, max_v in zip(min_list, max_list): + st_dt = _strong_dtype_num_kind(dpt.dtype(dt)) + wk_dt1 = _weak_type_num_kind(_get_dtype(min_v, x.sycl_device)) + wk_dt2 = _weak_type_num_kind(_get_dtype(max_v, x.sycl_device)) + + if st_dt >= wk_dt1 and st_dt >= wk_dt2: + r = dpt.clip(x, min_v, max_v) + assert isinstance(r, dpt.usm_ndarray) + else: + with pytest.raises(ValueError): + dpt.clip(x, min_v, max_v) + + if st_dt >= wk_dt1: + r = dpt.clip(x, min_v) + assert isinstance(r, dpt.usm_ndarray) + + r = dpt.clip(x, None, min_v) + assert isinstance(r, dpt.usm_ndarray) + else: + with 
pytest.raises(ValueError): + dpt.clip(x, min_v) + with pytest.raises(ValueError): + dpt.clip(x, None, max_v) + + +def test_clip_max_weak_type_errors(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype="i4") + m = dpt.ones(10, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, m, 2.5) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5, m) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5) + + with pytest.raises(ValueError): + dpt.clip(dpt.astype(x, "?"), 2) + + with pytest.raises(ValueError): + dpt.clip(dpt.astype(x, "f4"), complex(2)) + + +def test_clip_unaligned(): + get_queue_or_skip() + + x = dpt.full(513, 5, dtype="i4") + a_min = dpt.zeros(512, dtype="i4") + a_max = dpt.full(512, 2, dtype="i4") + + expected = dpt.full(512, 2, dtype="i4") + assert dpt.all(dpt.clip(x[1:], a_min, a_max) == expected) + + +def test_clip_none_args(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + r = dpt.clip(x) + assert dpt.all(x == r) + + +def test_clip_shape_errors(): + get_queue_or_skip() + + x = dpt.ones((4, 4), dtype="i4") + a_min = dpt.ones(5, dtype="i4") + a_max = dpt.ones(5, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, a_min, a_max) + + with pytest.raises(ValueError): + dpt.clip(x, a_min) + + with pytest.raises(ValueError): + dpt.clip(x, 0, 1, out=a_min) + + with pytest.raises(ValueError): + dpt.clip(x, 0, out=a_min) + + with pytest.raises(ValueError): + dpt.clip(x, out=a_min) + + +def test_clip_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.ones(10, dtype="i4", sycl_queue=q1) + a_min = dpt.ones(10, dtype="i4", sycl_queue=q2) + a_max = dpt.ones(10, dtype="i4", sycl_queue=q1) + res = dpt.empty_like(x, sycl_queue=q2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, a_min, a_max) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, dpt.ones_like(x), a_max, out=res) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, a_min) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, None, a_max, out=res) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, out=res) + + +def test_clip_readonly_out(): + get_queue_or_skip() + x = dpt.arange(32, dtype=dpt.int32) + r = dpt.empty_like(x) + r.flags["W"] = False + + with pytest.raises(ValueError): + dpt.clip(x, min=0, max=10, out=r) + + with pytest.raises(ValueError): + dpt.clip(x, max=10, out=r) + + with pytest.raises(ValueError): + dpt.clip(x, min=0, out=r) + + with pytest.raises(ValueError): + dpt.clip(x, out=r) + + +def test_clip_gh_1744(): + get_queue_or_skip() + x = dpt.asarray([0, 255], dtype=dpt.uint8) + y = dpt.clip(x, -300, 300) + + assert dpt.all(x == y) diff --git a/dpnp/tests/tensor/test_tensor_copy_utils.py b/dpnp/tests/tensor/test_tensor_copy_utils.py new file mode 100644 index 000000000000..878877dcaa4c --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_copy_utils.py @@ -0,0 +1,113 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +import dpnp.tensor._copy_utils as cu + +from .helper import get_queue_or_skip + + +def test_copy_utils_empty_like_orderK(): + get_queue_or_skip() + a = dpt.empty((10, 10), dtype=dpt.int32, order="F") + X = cu._empty_like_orderK(a, dpt.int32, a.usm_type, a.device) + assert X.flags["F"] + + +def test_copy_utils_empty_like_orderK_invalid_args(): + get_queue_or_skip() + with pytest.raises(TypeError): + cu._empty_like_orderK([1, 2, 3], dpt.int32, "device", None) + with pytest.raises(TypeError): + cu._empty_like_pair_orderK( + [1, 2, 3], + ( + 1, + 2, + 3, + ), + dpt.int32, + (3,), + "device", + None, + ) + + a = dpt.empty(10, dtype=dpt.int32) + with pytest.raises(TypeError): + cu._empty_like_pair_orderK( + a, + ( + 1, + 2, + 3, + ), + dpt.int32, + (10,), + "device", + None, + ) + + +def test_copy_utils_from_numpy_empty_like_orderK(): + q = get_queue_or_skip() + + a = np.empty((10, 10), dtype=np.int32, order="C") + r0 = cu._from_numpy_empty_like_orderK(a, dpt.int32, "device", q) + assert r0.flags["C"] + + b = np.empty((10, 10), dtype=np.int32, order="F") + r1 = cu._from_numpy_empty_like_orderK(b, dpt.int32, "device", q) + assert r1.flags["F"] + + c = np.empty((2, 3, 4), dtype=np.int32, order="C") + c = np.transpose(c, (1, 0, 2)) + r2 = cu._from_numpy_empty_like_orderK(c, dpt.int32, "device", q) + assert not r2.flags["C"] and not r2.flags["F"] + + +def test_copy_utils_from_numpy_empty_like_orderK_invalid_args(): + with pytest.raises(TypeError): + cu._from_numpy_empty_like_orderK([1, 2, 3], dpt.int32, "device", None) + + +def test_gh_2055(): + """ + Test that `dpt.asarray` works on contiguous NumPy arrays with `order="K"` + when dimensions are permuted. 
+ + See: https://github.com/IntelPython/dpctl/issues/2055 + """ + get_queue_or_skip() + + a = np.ones((2, 3, 4), dtype=dpt.int32) + a_t = np.transpose(a, (2, 0, 1)) + r = dpt.asarray(a_t) + assert not r.flags["C"] and not r.flags["F"] diff --git a/dpnp/tests/tensor/test_tensor_diff.py b/dpnp/tests/tensor/test_tensor_diff.py new file mode 100644 index 000000000000..f75b9d4a3639 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_diff.py @@ -0,0 +1,344 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from math import prod + +import pytest +from numpy.testing import assert_raises_regex + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _to_device_supported_dtype + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_diff_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.asarray([9, 12, 7, 17, 10, 18, 15, 9, 8, 8], dtype=dt, sycl_queue=q) + op = dpt.not_equal if x.dtype is dpt.bool else dpt.subtract + + # test both n=2 and n>2 branches + for n in [1, 2, 5]: + res = dpt.diff(x, n=n) + expected_res = x + for _ in range(n): + expected_res = op(expected_res[1:], expected_res[:-1]) + if dpt.dtype(dt).kind in "fc": + assert dpt.allclose(res, expected_res) + else: + assert dpt.all(res == expected_res) + + +def test_diff_axis(): + get_queue_or_skip() + + x = dpt.tile( + dpt.asarray([9, 12, 7, 17, 10, 18, 15, 9, 8, 8], dtype="i4"), (3, 4, 1) + ) + x[:, ::2, :] = 0 + + for n in [1, 2, 3]: + res = dpt.diff(x, n=n, axis=1) + expected_res = x + for _ in range(n): + expected_res = dpt.subtract( + expected_res[:, 1:, :], expected_res[:, :-1, :] + ) + assert dpt.all(res == expected_res) + + +def test_diff_prepend_append_type_promotion(): + get_queue_or_skip() + + dts = [ + ("i1", "u1", "i8"), + ("i1", "u8", "u1"), + ("u4", "i4", "f4"), + ("i8", "c8", "u8"), + ] + + for dt0, dt1, dt2 in dts: + x = dpt.ones(10, dtype=dt1) + prepend = dpt.full(1, 2, dtype=dt0) + append = dpt.full(1, 3, dtype=dt2) + + res = dpt.diff(x, prepend=prepend, append=append) + assert res.dtype == _to_device_supported_dtype( + dpt.result_type(prepend, x, append), + x.sycl_queue.sycl_device, + ) + + res = dpt.diff(x, prepend=prepend) + assert res.dtype == _to_device_supported_dtype( + dpt.result_type(prepend, x), + x.sycl_queue.sycl_device, + ) + + res = dpt.diff(x, append=append) + assert res.dtype == _to_device_supported_dtype( + dpt.result_type(x, append), + x.sycl_queue.sycl_device, + ) + + +def test_diff_0d(): + get_queue_or_skip() + + x = dpt.ones(()) + with pytest.raises(ValueError): + dpt.diff(x) + + +def test_diff_empty_array(): + get_queue_or_skip() + + x = dpt.ones((3, 0, 5)) + res = dpt.diff(x, axis=1) + assert res.shape == x.shape + + res = dpt.diff(x, axis=0) + assert res.shape == (2, 0, 5) + + append = dpt.ones((3, 2, 5)) + res = dpt.diff(x, axis=1, append=append) + assert res.shape == (3, 1, 5) + + prepend = dpt.ones((3, 2, 5)) + res = dpt.diff(x, axis=1, prepend=prepend) + assert res.shape == (3, 1, 5) + + +def test_diff_no_op(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.diff(x, n=0) + assert dpt.all(x == res) + + x = dpt.reshape(x, (2, 5)) + res = dpt.diff(x, n=0, axis=0) + assert dpt.all(x == res) + + +@pytest.mark.parametrize("sh,axis", [((1,), 0), ((3, 4, 5), 1)]) +def test_diff_prepend_append_py_scalars(sh, axis): + get_queue_or_skip() + + n = 1 + + arr = dpt.ones(sh, dtype="i4") + zero = 0 + + # first and last elements along axis + # will be checked for correctness + sl1 = [slice(None)] * arr.ndim + sl1[axis] = slice(1) + sl1 = tuple(sl1) + + sl2 = [slice(None)] * arr.ndim + sl2[axis] = slice(-1, None, None) + sl2 = tuple(sl2) + + r = dpt.diff(arr, axis=axis, prepend=zero, append=zero) + assert all(r.shape[i] == arr.shape[i] for i in 
range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 2 - n + assert dpt.all(r[sl1] == 1) + assert dpt.all(r[sl2] == -1) + + r = dpt.diff(arr, axis=axis, prepend=zero) + assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 1 - n + assert dpt.all(r[sl1] == 1) + + r = dpt.diff(arr, axis=axis, append=zero) + assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 1 - n + assert dpt.all(r[sl2] == -1) + + r = dpt.diff(arr, axis=axis, prepend=dpt.asarray(zero), append=zero) + assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 2 - n + assert dpt.all(r[sl1] == 1) + assert dpt.all(r[sl2] == -1) + + r = dpt.diff(arr, axis=axis, prepend=zero, append=dpt.asarray(zero)) + assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 2 - n + assert dpt.all(r[sl1] == 1) + assert dpt.all(r[sl2] == -1) + + +def test_tensor_diff_append_prepend_arrays(): + get_queue_or_skip() + + n = 1 + axis = 0 + + for sh in [(5,), (3, 4, 5)]: + sz = prod(sh) + arr = dpt.reshape(dpt.arange(sz, 2 * sz, dtype="i4"), sh) + prepend = dpt.reshape(dpt.arange(sz, dtype="i4"), sh) + append = dpt.reshape(dpt.arange(2 * sz, 3 * sz, dtype="i4"), sh) + const_diff = sz / sh[axis] + + r = dpt.diff(arr, axis=axis, prepend=prepend, append=append) + assert all( + r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis + ) + assert ( + r.shape[axis] + == arr.shape[axis] + prepend.shape[axis] + append.shape[axis] - n + ) + assert dpt.all(r == const_diff) + + r = dpt.diff(arr, axis=axis, prepend=prepend) + assert all( + r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis + ) + assert r.shape[axis] == arr.shape[axis] + prepend.shape[axis] - n + assert dpt.all(r == const_diff) + + r = dpt.diff(arr, axis=axis, append=append) + assert all( + r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis + ) + assert r.shape[axis] == arr.shape[axis] + append.shape[axis] - n + assert dpt.all(r == const_diff) + + +def test_diff_wrong_append_prepend_shape(): + get_queue_or_skip() + + arr = dpt.ones((3, 4, 5), dtype="i4") + arr_bad_sh = dpt.ones(2, dtype="i4") + + assert_raises_regex( + ValueError, + ".*shape.*is invalid.*", + dpt.diff, + arr, + prepend=arr_bad_sh, + append=arr_bad_sh, + ) + + assert_raises_regex( + ValueError, + ".*shape.*is invalid.*", + dpt.diff, + arr, + prepend=arr, + append=arr_bad_sh, + ) + + assert_raises_regex( + ValueError, + ".*shape.*is invalid.*", + dpt.diff, + arr, + prepend=arr_bad_sh, + ) + + assert_raises_regex( + ValueError, + ".*shape.*is invalid.*", + dpt.diff, + arr, + append=arr_bad_sh, + ) + + +def test_diff_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + q3 = get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4", sycl_queue=q1) + ar2 = dpt.ones(1, dtype="i4", sycl_queue=q2) + ar3 = dpt.ones(1, dtype="i4", sycl_queue=q3) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, prepend=ar2, append=ar3) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, prepend=ar2, append=0) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, prepend=0, append=ar2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, prepend=ar2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, append=ar2) + + +def 
test_diff_input_validation(): + bad_in = {} + assert_raises_regex( + TypeError, + "Expecting dpnp.tensor.usm_ndarray type, got.*", + dpt.diff, + bad_in, + ) + + +def test_diff_positive_order(): + get_queue_or_skip() + + x = dpt.ones(1, dtype="i4") + n = -1 + assert_raises_regex( + ValueError, + ".*must be positive.*", + dpt.diff, + x, + n=n, + ) diff --git a/dpnp/tests/tensor/test_tensor_dtype_routines.py b/dpnp/tests/tensor/test_tensor_dtype_routines.py new file mode 100644 index 000000000000..588926c0d123 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_dtype_routines.py @@ -0,0 +1,170 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import dpctl +import pytest + +import dpnp.tensor as dpt + +list_dtypes = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float16", + "float32", + "float64", + "complex64", + "complex128", +] + + +dtype_categories = { + "bool": ["bool"], + "signed integer": ["int8", "int16", "int32", "int64"], + "unsigned integer": ["uint8", "uint16", "uint32", "uint64"], + "integral": [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + ], + "real floating": ["float16", "float32", "float64"], + "complex floating": ["complex64", "complex128"], + "numeric": [d for d in list_dtypes if d != "bool"], +} + + +@pytest.mark.parametrize("kind_str", dtype_categories.keys()) +@pytest.mark.parametrize("dtype_str", list_dtypes) +def test_isdtype_kind_str(dtype_str, kind_str): + dt = dpt.dtype(dtype_str) + is_in_kind = dpt.isdtype(dt, kind_str) + expected = dtype_str in dtype_categories[kind_str] + assert is_in_kind == expected + + +@pytest.mark.parametrize("dtype_str", list_dtypes) +def test_isdtype_kind_tuple(dtype_str): + dt = dpt.dtype(dtype_str) + if dtype_str.startswith("bool"): + assert dpt.isdtype(dt, ("real floating", "bool")) + assert not dpt.isdtype( + dt, ("integral", "real floating", "complex floating") + ) + elif dtype_str.startswith("int"): + assert dpt.isdtype(dt, ("real floating", "signed integer")) + assert not dpt.isdtype( + dt, ("bool", "unsigned integer", "real floating") + ) + elif dtype_str.startswith("uint"): + assert dpt.isdtype(dt, ("bool", "unsigned integer")) + assert not dpt.isdtype(dt, ("real floating", "complex floating")) + elif dtype_str.startswith("float"): + assert dpt.isdtype(dt, ("complex floating", "real floating")) + assert not dpt.isdtype(dt, ("integral", "complex floating", "bool")) + else: + assert dpt.isdtype(dt, ("integral", "complex floating")) + assert not dpt.isdtype(dt, ("bool", "integral", "real floating")) + + +@pytest.mark.parametrize("dtype_str", list_dtypes) +def test_isdtype_kind_tuple_dtypes(dtype_str): + dt = dpt.dtype(dtype_str) + if dtype_str.startswith("bool"): + assert dpt.isdtype(dt, (dpt.int32, dpt.bool)) + assert not dpt.isdtype(dt, (dpt.int16, dpt.uint32, dpt.float64)) + + elif dtype_str.startswith("int"): + assert dpt.isdtype(dt, (dpt.int8, dpt.int16, dpt.int32, dpt.int64)) + assert not dpt.isdtype(dt, (dpt.bool, dpt.float32, dpt.complex64)) + + elif dtype_str.startswith("uint"): + assert dpt.isdtype(dt, (dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64)) + assert not dpt.isdtype(dt, (dpt.bool, dpt.int32, dpt.float32)) + + elif dtype_str.startswith("float"): + assert dpt.isdtype(dt, (dpt.float16, dpt.float32, dpt.float64)) + assert not dpt.isdtype(dt, (dpt.bool, dpt.complex64, dpt.int8)) + + else: + assert dpt.isdtype(dt, (dpt.complex64, dpt.complex128)) + assert not dpt.isdtype(dt, (dpt.bool, dpt.uint64, dpt.int8)) + + +@pytest.mark.parametrize( + "kind", + [ + [dpt.int32, dpt.bool], + "f4", + float, + 123, + "complex", + ], +) +def test_isdtype_invalid_kind(kind): + with pytest.raises((TypeError, ValueError)): + dpt.isdtype(dpt.int32, kind) + + +def test_finfo_array(): + try: + x = dpt.empty(tuple(), dtype="f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("Default-selected SYCL device unavailable") + o = dpt.finfo(x) + assert o.dtype == dpt.float32 + + +def test_iinfo_array(): + try: + x = dpt.empty(tuple(), dtype="i4") + except dpctl.SyclDeviceCreationError: + 
pytest.skip("Default-selected SYCL device unavailable") + o = dpt.iinfo(x) + assert o.dtype == dpt.int32 + + +def test_iinfo_validation(): + with pytest.raises(ValueError): + dpt.iinfo("O") + + +def test_finfo_validation(): + with pytest.raises(ValueError): + dpt.iinfo("O") diff --git a/dpnp/tests/tensor/test_tensor_isin.py b/dpnp/tests/tensor/test_tensor_isin.py new file mode 100644 index 000000000000..08f1787f733f --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_isin.py @@ -0,0 +1,281 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
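+
+
+# Editorial sketch, not part of the upstream test suite: the category-based
+# introspection exercised above, in compact form (results follow the array
+# API specification that dpnp.tensor implements):
+#
+#     dpt.isdtype(dpt.int32, "integral")                   # True
+#     dpt.isdtype(dpt.float32, ("bool", "real floating"))  # True
+#     dpt.isdtype(dpt.complex64, "numeric")                # True
+#     dpt.finfo(dpt.float32).eps                           # ~1.19e-07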
diff --git a/dpnp/tests/tensor/test_tensor_isin.py b/dpnp/tests/tensor/test_tensor_isin.py
new file mode 100644
index 000000000000..08f1787f733f
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_isin.py
@@ -0,0 +1,281 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_numeric_dtypes = [
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+_all_dtypes = ["?"] + _numeric_dtypes
+
+
+@pytest.mark.parametrize("dtype", _numeric_dtypes)
+def test_isin_basic(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n = 100
+    x = dpt.arange(n, dtype=dtype, sycl_queue=q)
+    test = dpt.arange(n - 1, dtype=dtype, sycl_queue=q)
+    r1 = dpt.isin(x, test)
+    assert dpt.all(r1[:-1])
+    assert not r1[-1]
+    assert r1.shape == x.shape
+
+    # test with invert keyword
+    r2 = dpt.isin(x, test, invert=True)
+    assert not dpt.any(r2[:-1])
+    assert r2[-1]
+    assert r2.shape == x.shape
+
+
+def test_isin_basic_bool():
+    dt = dpt.bool
+    n = 100
+    x = dpt.zeros(n, dtype=dt)
+    x[-1] = True
+    test = dpt.zeros((), dtype=dt)
+    r1 = dpt.isin(x, test)
+    assert dpt.all(r1[:-1])
+    assert not r1[-1]
+    assert r1.shape == x.shape
+
+    r2 = dpt.isin(x, test, invert=True)
+    assert not dpt.any(r2[:-1])
+    assert r2[-1]
+    assert r2.shape == x.shape
+
+
+@pytest.mark.parametrize("dtype", _numeric_dtypes)
+def test_isin_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, m = 100, 20
+    x = dpt.zeros((n, m), dtype=dtype, order="F", sycl_queue=q)
+    x[:, ::2] = dpt.arange(1, (m / 2) + 1, dtype=dtype, sycl_queue=q)
+    x_s = x[:, ::2]
+    test = dpt.arange(1, (m / 2), dtype=dtype, sycl_queue=q)
+    r1 = dpt.isin(x_s, test)
+    assert dpt.all(r1[:, :-1])
+    assert not dpt.any(r1[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r1.shape == x_s.shape
+    assert r1.flags.c_contiguous
+
+    # test with invert keyword
+    r2 = dpt.isin(x_s, test, invert=True)
+    assert not dpt.any(r2[:, :-1])
+    assert dpt.all(r2[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r2.shape == x_s.shape
+    assert r2.flags.c_contiguous
+
+
+def test_isin_strided_bool():
+    dt = dpt.bool
+
+    n, m = 100, 20
+    x = dpt.zeros((n, m), dtype=dt, order="F")
+    x[:, :-2:2] = True
+    x_s = x[:, ::2]
+    test = dpt.ones((), dtype=dt)
+    r1 = dpt.isin(x_s, test)
+    assert dpt.all(r1[:, :-1])
+    assert not dpt.any(r1[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r1.shape == x_s.shape
+    assert r1.flags.c_contiguous
+
+    # test with invert keyword
+    r2 = dpt.isin(x_s, test, invert=True)
+    assert not dpt.any(r2[:, :-1])
+    assert dpt.all(r2[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r2.shape == x_s.shape
+    assert r2.flags.c_contiguous
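+
+
+# Editorial sketch, not part of the upstream test suite: dpt.isin is an
+# elementwise membership test of `x` against the flattened test values,
+# e.g. for the shapes used below:
+#
+#     x = dpt.asarray([0, 1, 11], dtype="i4")
+#     dpt.isin(x, dpt.arange(10, dtype="i4"))  # [True, True, False]
+#     # invert=True would yield the negated mask [False, False, True]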
+
+
+@pytest.mark.parametrize("dt1", _numeric_dtypes)
+@pytest.mark.parametrize("dt2", _numeric_dtypes)
+def test_isin_dtype_matrix(dt1, dt2):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    sz = 10
+    x = dpt.asarray([0, 1, 11], dtype=dt1, sycl_queue=q)
+    test1 = dpt.arange(sz, dtype=dt2, sycl_queue=q)
+
+    r1 = dpt.isin(x, test1)
+    assert isinstance(r1, dpt.usm_ndarray)
+    assert r1.dtype == dpt.bool
+    assert r1.shape == x.shape
+    assert not r1[-1]
+    assert dpt.all(r1[0:-1])
+    assert r1.sycl_queue == x.sycl_queue
+
+    test2 = dpt.tile(dpt.asarray([[0, 1]], dtype=dt2, sycl_queue=q).mT, 2)
+    r2 = dpt.isin(x, test2)
+    assert isinstance(r2, dpt.usm_ndarray)
+    assert r2.dtype == dpt.bool
+    assert r2.shape == x.shape
+    assert not r2[-1]
+    assert dpt.all(r2[0:-1])
+    assert r2.sycl_queue == x.sycl_queue
+
+
+def test_isin_empty_inputs():
+    get_queue_or_skip()
+
+    x = dpt.ones((10, 0, 1), dtype="i4")
+    test = dpt.ones((), dtype="i4")
+    res1 = dpt.isin(x, test)
+    assert isinstance(res1, dpt.usm_ndarray)
+    assert res1.size == 0
+    assert res1.shape == x.shape
+    assert res1.dtype == dpt.bool
+
+    res2 = dpt.isin(x, test, invert=True)
+    assert isinstance(res2, dpt.usm_ndarray)
+    assert res2.size == 0
+    assert res2.shape == x.shape
+    assert res2.dtype == dpt.bool
+
+    x = dpt.ones((3, 3), dtype="i4")
+    test = dpt.ones(0, dtype="i4")
+    res3 = dpt.isin(x, test)
+    assert isinstance(res3, dpt.usm_ndarray)
+    assert res3.shape == x.shape
+    assert res3.dtype == dpt.bool
+    assert not dpt.all(res3)
+
+    res4 = dpt.isin(x, test, invert=True)
+    assert isinstance(res4, dpt.usm_ndarray)
+    assert res4.shape == x.shape
+    assert res4.dtype == dpt.bool
+    assert dpt.all(res4)
+
+
+def test_isin_validation():
+    get_queue_or_skip()
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.isin(1, 1)
+    not_bool = {}
+    with pytest.raises(TypeError):
+        dpt.isin(dpt.ones([1]), dpt.ones([1]), invert=not_bool)
+
+
+def test_isin_special_floating_point_vals():
+    get_queue_or_skip()
+
+    # real and complex nans compare false
+    x = dpt.asarray(dpt.nan, dtype="f4")
+    test = dpt.asarray(dpt.nan, dtype="f4")
+    assert not dpt.isin(x, test)
+
+    x = dpt.asarray(dpt.nan, dtype="c8")
+    test = dpt.asarray(dpt.nan, dtype="c8")
+    assert not dpt.isin(x, test)
+
+    # -0.0 compares equal to +0.0
+    x = dpt.asarray(-0.0, dtype="f4")
+    test = dpt.asarray(0.0, dtype="f4")
+    assert dpt.isin(x, test)
+    assert dpt.isin(test, x)
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_isin_py_scalars(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.zeros((10, 10), dtype=dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        r1 = dpt.isin(x, sc)
+        assert isinstance(r1, dpt.usm_ndarray)
+        r2 = dpt.isin(sc, x)
+        assert isinstance(r2, dpt.usm_ndarray)
+
+
+def test_isin_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.ones(10, sycl_queue=q1)
+    test = dpt.ones_like(x, sycl_queue=q2)
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.isin(x, test)
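+
+
+# Editorial sketch, not part of the upstream test suite: the special-value
+# checks above follow IEEE-754 equality, under which nan != nan while
+# -0.0 == +0.0, so membership is decided by `==` rather than by identity:
+#
+#     dpt.isin(dpt.asarray(dpt.nan, dtype="f4"),
+#              dpt.asarray(dpt.nan, dtype="f4"))  # False
+#     dpt.isin(dpt.asarray(-0.0, dtype="f4"),
+#              dpt.asarray(0.0, dtype="f4"))      # True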
diff --git a/dpnp/tests/tensor/test_tensor_statistical_functions.py b/dpnp/tests/tensor/test_tensor_statistical_functions.py
new file mode 100644
index 000000000000..7e444500d75f
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_statistical_functions.py
@@ -0,0 +1,271 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._tensor_impl import default_device_fp_type
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_no_complex_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+]
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+def test_mean_dtypes(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.mean(x)
+    assert res == 1
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+@pytest.mark.parametrize("py_zero", [float(0), int(0)])
+def test_std_var_dtypes(dt, py_zero):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.std(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+    res = dpt.var(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+def test_stat_fns_axis():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1))
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 6)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1))
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+@pytest.mark.parametrize("fn", [dpt.mean, dpt.var])
+def test_stat_fns_empty(fn):
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="f4")
+    r = fn(x)
+    assert r.shape == ()
+    assert dpt.isnan(r)
+
+    x = dpt.empty((10, 0, 2), dtype="f4")
+    r = fn(x, axis=1)
+    assert r.shape == (10, 2)
+    assert dpt.all(dpt.isnan(r))
+
+    r = fn(x, axis=0)
+    assert r.shape == (0, 2)
+    assert r.size == 0
+
+
+def test_stat_fns_keepdims():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1), keepdims=True)
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1), keepdims=True)
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+def test_stat_fns_empty_axis():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
+    m = 
dpt.mean(x, axis=()) + + assert x.shape == m.shape + assert dpt.all(x == m) + + s = dpt.var(x, axis=()) + assert x.shape == s.shape + assert dpt.all(s == 0) + + d = dpt.std(x, axis=()) + assert x.shape == d.shape + assert dpt.all(d == 0) + + +def test_mean(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + m = dpt.mean(x) + expected = dpt.asarray(4, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=0) + expected = dpt.arange(3, 6, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=1) + expected = dpt.asarray([1, 4, 7], dtype="f4") + assert dpt.allclose(m, expected) + + +def test_var_std(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + r = dpt.var(x) + expected = dpt.asarray(6.666666507720947, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, correction=3) + expected1 = dpt.asarray(10.0, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, correction=3) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=0) + expected = dpt.full(x.shape[1], 6, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=0, correction=1) + expected1 = dpt.full(x.shape[1], 9, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=0) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=0, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=1) + expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=1, correction=1) + expected1 = dpt.ones(x.shape[0], dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=1) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=1, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + +def test_var_axis_length_correction(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + + r = dpt.var(x, correction=x.size) + assert dpt.isnan(r) + + r = dpt.var(x, axis=0, correction=x.shape[0]) + assert dpt.all(dpt.isnan(r)) + + r = dpt.var(x, axis=1, correction=x.shape[1]) + assert dpt.all(dpt.isnan(r)) + + +def test_stat_function_errors(): + d = {} + with pytest.raises(TypeError): + dpt.var(d) + with pytest.raises(TypeError): + dpt.std(d) + with pytest.raises(TypeError): + dpt.mean(d) + + get_queue_or_skip() + x = dpt.empty(1, dtype="f4") + with pytest.raises(TypeError): + dpt.var(x, axis=d) + with pytest.raises(TypeError): + dpt.std(x, axis=d) + with pytest.raises(TypeError): + dpt.mean(x, axis=d) + + with pytest.raises(TypeError): + dpt.var(x, correction=d) + with pytest.raises(TypeError): + dpt.std(x, correction=d) + + x = dpt.empty(1, dtype="c8") + with pytest.raises(ValueError): + dpt.var(x) + with pytest.raises(ValueError): + dpt.std(x) diff --git a/dpnp/tests/tensor/test_tensor_sum.py b/dpnp/tests/tensor/test_tensor_sum.py new file mode 100644 index 000000000000..90e548f1b28c --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_sum.py @@ -0,0 +1,348 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + # test reduction for C-contiguous input + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(m) + + assert isinstance(r, dpt.usm_ndarray) + if m.dtype.kind == "i": + assert r.dtype.kind == "i" + elif m.dtype.kind == "u": + assert r.dtype.kind == "u" + elif m.dtype.kind == "f": + assert r.dtype.kind == "f" + elif m.dtype.kind == "c": + assert r.dtype.kind == "c" + + assert dpt.all(r == 100) + + # test reduction for strided input + m = dpt.ones(200, dtype=arg_dtype)[:1:-2] + r = dpt.sum(m) + assert dpt.all(r == 99) + + # test reduction for strided input which can be simplified + # to contiguous computation + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(dpt.flip(m)) + assert dpt.all(r == 100) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert dpt.all(r == 100) + + +def test_sum_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="u1") + y = dpt.sum(x) + assert y.shape == () + assert int(y) == 0 + + +def test_sum_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.sum(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.all(s 
== dpt.asarray(4 * 5 * 7, dtype="i4")) + + +def test_sum_keepdims(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.sum(m, axis=(1, 2, -1), keepdims=True) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype)) + + +def test_sum_scalar(): + get_queue_or_skip() + + m = dpt.ones(()) + s = dpt.sum(m) + + assert isinstance(s, dpt.usm_ndarray) + assert m.sycl_queue == s.sycl_queue + assert s.shape == () + assert s == dpt.full((), 1) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones((), dtype=arg_dtype) + r = dpt.sum(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert r == 1 + + +def test_sum_keepdims_zero_size(): + """See gh-1293""" + get_queue_or_skip() + n = 10 + a = dpt.ones((n, 0, n)) + + s1 = dpt.sum(a, keepdims=True) + assert s1.shape == (1, 1, 1) + + s2 = dpt.sum(a, axis=(0, 1), keepdims=True) + assert s2.shape == (1, 1, n) + + s3 = dpt.sum(a, axis=(1, 2), keepdims=True) + assert s3.shape == (n, 1, 1) + + s4 = dpt.sum(a, axis=(0, 2), keepdims=True) + assert s4.shape == (1, 0, 1) + + a0 = a[0] + s5 = dpt.sum(a0, keepdims=True) + assert s5.shape == (1, 1) + + +@pytest.mark.parametrize("arg_dtype", ["i8", "f4", "c8"]) +@pytest.mark.parametrize("n", [1023, 1024, 1025]) +def test_largish_reduction(arg_dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = 5 + x = dpt.ones((m, n, m), dtype=arg_dtype) + + y1 = dpt.sum(x, axis=(0, 1)) + y2 = dpt.sum(x, axis=(1, 2)) + + assert dpt.all(dpt.equal(y1, y2)) + assert dpt.all(dpt.equal(y1, n * m)) + + +@pytest.mark.parametrize("n", [1023, 1024, 1025]) +def test_largish_reduction_axis1_axis0(n): + get_queue_or_skip() + + m = 25 + x1 = dpt.ones((m, n), dtype="f4") + x2 = dpt.ones((n, m), dtype="f4") + + y1 = dpt.sum(x1, axis=1) + y2 = dpt.sum(x2, axis=0) + + assert dpt.all(y1 == n) + assert dpt.all(y2 == n) + + +def test_axis0_bug(): + "gh-1391" + get_queue_or_skip() + + sh = (1, 2, 3) + a = dpt.arange(sh[0] * sh[1] * sh[2], dtype="i4") + a = dpt.reshape(a, sh) + aT = dpt.permute_dims(a, (2, 1, 0)) + + s = dpt.sum(aT, axis=2) + expected = dpt.asarray([[0, 3], [1, 4], [2, 5]]) + + assert dpt.all(s == expected) + + +def test_sum_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + # The atomic case is checked in `test_usm_ndarray_reductions` + # This test checks the tree reduction path for correctness + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.sum(x, axis=0) + expected = dpt.asarray( + [ + [60, 63, 66, 69, 72], + [75, 78, 81, 84, 87], + [90, 93, 96, 99, 102], + [105, 108, 111, 114, 117], + ], + dtype="f4", + ) + tol = dpt.finfo(m.dtype).resolution + assert dpt.allclose(m, expected, atol=tol, rtol=tol) + + x = dpt.flip(x, axis=2) + m = dpt.sum(x, axis=2) + expected = dpt.asarray( + [[10, 35, 60, 85], [110, 135, 160, 185], [210, 235, 260, 285]], + dtype="f4", + ) + assert dpt.allclose(m, expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:]) +def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + arg_dtype = dpt.dtype(arg_dtype) + + m = dpt.ones(100, 
dtype=arg_dtype) + r = dpt.prod(m) + + assert isinstance(r, dpt.usm_ndarray) + if m.dtype.kind == "i": + assert r.dtype.kind == "i" + elif m.dtype.kind == "u": + assert r.dtype.kind == "u" + elif m.dtype.kind == "f": + assert r.dtype.kind == "f" + elif m.dtype.kind == "c": + assert r.dtype.kind == "c" + assert dpt.all(r == 1) + + if dpt.isdtype(m.dtype, "unsigned integer"): + m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(512, dtype=r.dtype)) + else: + m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype)) + + +def test_prod_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="u1") + y = dpt.prod(x) + assert y.shape == () + assert int(y) == 1 + + +def test_prod_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.prod(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.all(s == dpt.asarray(1, dtype="i4")) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + out_dtype = dpt.dtype(out_dtype) + arg_dtype = dpt.dtype(arg_dtype) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.prod(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert dpt.all(r == 1) + + +def test_gh_1468(): + "See https://github.com/IntelPython/dpctl/issues/1468" + get_queue_or_skip() + + a = dpt.full((2, 3, 4), 123456789, dtype=dpt.int32) + t = dpt.sum(a, dtype="f4") + assert t > 0 + + +@pytest.mark.parametrize( + "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"] +) +def test_gh_1944(dt): + "See https://github.com/IntelPython/dpctl/issues/1944" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q) + r = dpt.sum(x, dtype="?") + # reduction must be performed in the requested dtype + # if performed in the input type, result is False + assert r diff --git a/dpnp/tests/tensor/test_tensor_testing.py b/dpnp/tests/tensor/test_tensor_testing.py new file mode 100644 index 000000000000..34cc40987354 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_testing.py @@ -0,0 +1,181 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_allclose(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    a1 = dpt.ones(10, dtype=dtype)
+    a2 = dpt.ones(10, dtype=dtype)
+
+    assert dpt.allclose(a1, a2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_allclose_real_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+    a1 = dpt.asarray(v[2:], dtype=dtype)
+    a2 = dpt.asarray(v[2:], dtype=dtype)
+
+    tol = dpt.finfo(a1.dtype).resolution
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol)
+
+    a1 = dpt.asarray(v, dtype=dtype)
+    a2 = dpt.asarray(v, dtype=dtype)
+
+    assert not dpt.allclose(a1, a2, atol=tol, rtol=tol)
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_allclose_complex_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+
+    not_nans = [complex(*xy) for xy in itertools.product(v[2:], repeat=2)]
+    z1 = dpt.asarray(not_nans, dtype=dtype)
+    z2 = dpt.asarray(not_nans, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol)
+
+    both = [complex(*xy) for xy in itertools.product(v, repeat=2)]
+    z1 = dpt.asarray(both, dtype=dtype)
+    z2 = dpt.asarray(both, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert not dpt.allclose(z1, z2, atol=tol, rtol=tol)
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol, equal_nan=True)
+
+
+def test_allclose_validation():
+    with pytest.raises(TypeError):
+        dpt.allclose(True, False)
+
+    get_queue_or_skip()
+    x = dpt.asarray(True)
+    with pytest.raises(TypeError):
+        dpt.allclose(x, False)
+
+
+def test_allclose_type_promotion():
+    get_queue_or_skip()
+
+    x1 = dpt.ones(10, dtype="i4")
+    x2 = dpt.ones(10, dtype="i8")
+
+    assert dpt.allclose(x1, x2)
+
+
+def test_allclose_tolerance():
+    get_queue_or_skip()
+
+    x = dpt.zeros(10, dtype="f4")
+    atol = 1e-5
+    y = dpt.full_like(x, atol)
+    assert dpt.allclose(x, y, atol=atol, rtol=0)
+
+    # 2**-17, about 8e-6
+    tol = float.fromhex("0x1.0p-17")
+    x = dpt.ones(10, dtype="f4")
+    y = x - tol
+    assert dpt.allclose(x, y, atol=0, rtol=tol)
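+
+
+# Illustrative sketch, not exercised by the suite: the tolerance checks above
+# assume a NumPy-style elementwise criterion,
+# abs(x - y) <= atol + rtol * abs(y). A naive reference implementation under
+# that assumption would be:
+def _naive_allclose(x, y, rtol=1e-05, atol=1e-08):
+    # elementwise comparison reduced with dpt.all; no equal_nan handling
+    return bool(dpt.all(dpt.abs(x - y) <= atol + rtol * dpt.abs(y)))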
+
+
+def test_allclose_real_fp_early_exits():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="f4")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="f4")
+
+    # early exits, inf positions are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="f4")
+
+    # early exits, inf positions are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+
+def test_allclose_complex_fp_early_exits():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="c8")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="c8")
+
+    # early exits, inf positions of real parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="c8")
+
+    # early exits, inf positions of real parts are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+    x1 = dpt.asarray([0.0, dpt.inf * 1j, -dpt.inf * 1j], dtype="c8")
+    x2 = dpt.asarray([dpt.inf * 1j, 0.0, -dpt.inf * 1j], dtype="c8")
+
+    # early exits, inf positions of imag parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf * 1j, dpt.inf * 1j], dtype="c8")
+    assert not dpt.allclose(x1, x2)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_ctor.py b/dpnp/tests/tensor/test_usm_ndarray_ctor.py
new file mode 100644
index 000000000000..70066860b19f
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_ctor.py
@@ -0,0 +1,2324 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import ctypes +import numbers +from math import prod + +import dpctl +import dpctl.memory as dpm +import numpy as np +import pytest +from numpy.testing import assert_raises_regex + +import dpnp.tensor as dpt +from dpnp.tensor import Device + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "b1", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize( + "shape", + [ + (), + (4,), + (0,), + (0, 1), + (0, 0), + (4, 5), + (2, 5, 2), + (2, 2, 2, 2, 2, 2, 2, 2), + 5, + np.int32(7), + ], +) +@pytest.mark.parametrize("usm_type", ["shared", "host", "device"]) +def test_allocate_usm_ndarray(shape, usm_type): + q = get_queue_or_skip() + X = dpt.usm_ndarray( + shape, dtype="i8", buffer=usm_type, buffer_ctor_kwargs={"queue": q} + ) + Xnp = np.ndarray(shape, dtype="i8") + assert X.usm_type == usm_type + assert X.sycl_context == q.sycl_context + assert X.sycl_device == q.sycl_device + assert X.size == Xnp.size + assert X.shape == Xnp.shape + assert X.shape == X.__sycl_usm_array_interface__["shape"] + + +def test_usm_ndarray_flags(): + get_queue_or_skip() + f = dpt.usm_ndarray((5,), dtype="i4").flags + assert f.fc + assert f.forc + + f = dpt.usm_ndarray((5, 2), dtype="i4").flags + assert f.c_contiguous + assert f.forc + + f = dpt.usm_ndarray((5, 2), dtype="i4", order="F").flags + assert f.f_contiguous + assert f.forc + assert f.fnc + + f = dpt.usm_ndarray((5,), dtype="i4", strides=(1,)).flags + assert f.fc + assert f.forc + + f = dpt.usm_ndarray((5, 1, 2), dtype="i4", strides=(2, 0, 1)).flags + assert f.c_contiguous + assert f.forc + + f = dpt.usm_ndarray((5, 1, 2), dtype="i4", strides=(1, 0, 5)).flags + assert f.f_contiguous + assert f.forc + assert f.fnc + + f = dpt.usm_ndarray((5, 0, 1), dtype="i4", strides=(1, 0, 1)).flags + assert f.fc + assert f.forc + assert not dpt.usm_ndarray( + (5, 1, 1), dtype="i4", strides=(2, 0, 1) + ).flags.forc + + x = dpt.empty(5, dtype="u2") + assert x.flags.writable is True + x.flags.writable = False + assert x.flags.writable is False + with pytest.raises(ValueError): + x[:] = 0 + x.flags["W"] = True + assert x.flags.writable is True + x.flags["WRITABLE"] = True + assert x.flags.writable is True + x[:] = 0 + + with pytest.raises(TypeError): + x.flags.writable = {} + with pytest.raises(ValueError): + x.flags["C"] = False + + +def test_usm_ndarray_flags_bug_gh_1334(): + get_queue_or_skip() + a = dpt.ones((2, 3), dtype="u4") + r = dpt.reshape(a, (1, 6, 1)) + assert r.flags["C"] and r.flags["F"] + + a = dpt.ones((2, 3), dtype="u4", order="F") + r = dpt.reshape(a, (1, 6, 1), order="F") + assert r.flags["C"] and r.flags["F"] + + a = dpt.ones((2, 3, 4), dtype="i8") + r = dpt.sum(a, axis=(1, 2), keepdims=True) + assert r.flags["C"] and r.flags["F"] + + a = dpt.ones((2, 1), dtype="?") + r = a[:, 1::-1] + assert r.flags["F"] and r.flags["C"] + + +def test_usm_ndarray_writable_flag_views(): + get_queue_or_skip() + a = dpt.arange(10, dtype="f4") + a.flags["W"] = False + + a.shape = (5, 2) + assert not a.flags.writable + assert not a.T.flags.writable + assert not a.mT.flags.writable + assert not a.real.flags.writable + assert not a[0:3].flags.writable + + a = dpt.arange(10, dtype="c8") + a.flags["W"] = False + + assert not a.real.flags.writable + assert not a.imag.flags.writable + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", 
_all_dtypes) +def test_usm_ndarray_from_zero_sized_usm_ndarray(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + x1 = dpt.ones((0,), dtype=dt1, sycl_queue=q) + x2 = dpt.usm_ndarray(x1.shape, dtype=dt2, buffer=x1) + assert x2.dtype == dt2 + assert x2.sycl_queue == q + assert x2._pointer == x1._pointer + assert x2.shape == x1.shape + + +def test_usm_ndarray_from_usm_ndarray_readonly(): + get_queue_or_skip() + + x1 = dpt.arange(10, dtype="f4") + x1.flags["W"] = False + x2 = dpt.usm_ndarray(x1.shape, dtype="f4", buffer=x1) + assert not x2.flags.writable + + +@pytest.mark.parametrize( + "dtype", + _all_dtypes + + [ + b"float32", + dpt.dtype("d"), + np.half, + ], +) +def test_dtypes(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + Xusm = dpt.usm_ndarray((1,), dtype=dtype) + assert Xusm.itemsize == dpt.dtype(dtype).itemsize + expected_fmt = (dpt.dtype(dtype).str)[1:] + actual_fmt = Xusm.__sycl_usm_array_interface__["typestr"][1:] + assert expected_fmt == actual_fmt + + +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +@pytest.mark.parametrize("buffer_ctor_kwargs", [dict(), {"queue": None}]) +def test_default_dtype(usm_type, buffer_ctor_kwargs): + q = get_queue_or_skip() + dev = q.get_sycl_device() + if buffer_ctor_kwargs: + buffer_ctor_kwargs["queue"] = q + Xusm = dpt.usm_ndarray( + (1,), buffer=usm_type, buffer_ctor_kwargs=buffer_ctor_kwargs + ) + if dev.has_aspect_fp64: + expected_dtype = "f8" + else: + expected_dtype = "f4" + assert Xusm.itemsize == dpt.dtype(expected_dtype).itemsize + expected_fmt = (dpt.dtype(expected_dtype).str)[1:] + actual_fmt = Xusm.__sycl_usm_array_interface__["typestr"][1:] + assert expected_fmt == actual_fmt + + +@pytest.mark.parametrize( + "dtype", + [ + "", + ">f4", + "invalid", + 123, + np.dtype(">f4"), + np.dtype([("a", ">f4"), ("b", "i4")]), + ], +) +def test_dtypes_invalid(dtype): + with pytest.raises((TypeError, ValueError)): + dpt.usm_ndarray((1,), dtype=dtype) + + +@pytest.mark.parametrize("dt", ["f", "c8"]) +def test_properties(dt): + """ + Test that properties execute + """ + try: + X = dpt.usm_ndarray((3, 4, 5), dtype=dt) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert isinstance(X.sycl_queue, dpctl.SyclQueue) + assert isinstance(X.sycl_device, dpctl.SyclDevice) + assert isinstance(X.sycl_context, dpctl.SyclContext) + assert isinstance(X.dtype, dpt.dtype) + assert isinstance(X.__sycl_usm_array_interface__, dict) + assert isinstance(X.mT, dpt.usm_ndarray) + assert isinstance(X.imag, dpt.usm_ndarray) + assert isinstance(X.real, dpt.usm_ndarray) + assert isinstance(X.shape, tuple) + assert isinstance(X.strides, tuple) + assert X.usm_type in ("shared", "device", "host") + assert isinstance(X.size, numbers.Integral) + assert isinstance(X.nbytes, numbers.Integral) + assert isinstance(X.ndim, numbers.Integral) + assert isinstance(X._pointer, numbers.Integral) + assert isinstance(X.device, Device) + with pytest.raises(ValueError): + # array-API mandates exception for .ndim != 2 + X.T + Y = dpt.usm_ndarray((2, 3), dtype=dt) + assert isinstance(Y.mT, dpt.usm_ndarray) + V = dpt.usm_ndarray((3,), dtype=dt) + with pytest.raises(ValueError): + # array-API mandates exception for .ndim != 2 + V.mT + + +@pytest.mark.parametrize("shape", [tuple(), (1,), (1, 1), (1, 1, 1)]) +@pytest.mark.parametrize("dtype", ["|b1", "|u2", "|f4", "|i8"]) +class TestCopyScalar: + @pytest.mark.parametrize("func", [bool, float, int, 
complex]) + def test_copy_scalar_with_func(self, func, shape, dtype): + try: + X = dpt.usm_ndarray(shape, dtype=dtype) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Y = np.arange(1, X.size + 1, dtype=dtype) + X.usm_data.copy_from_host(Y.view("|u1")) + Y = Y.reshape(()) + # Non-0D numeric arrays must not be convertible to Python scalars + if len(shape) != 0: + assert_raises_regex(TypeError, "only 0-dimensional arrays", func, X) + else: + # 0D arrays are allowed to convert + assert func(X) == func(Y) + + @pytest.mark.parametrize( + "method", ["__bool__", "__float__", "__int__", "__complex__"] + ) + def test_copy_scalar_with_method(self, method, shape, dtype): + try: + X = dpt.usm_ndarray(shape, dtype=dtype) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Y = np.arange(1, X.size + 1, dtype=dtype) + X.usm_data.copy_from_host(Y.view("|u1")) + Y = Y.reshape(()) + if len(shape) != 0: + assert_raises_regex( + TypeError, "only 0-dimensional arrays", getattr(X, method) + ) + else: + assert getattr(X, method)() == getattr(Y, method)() + + +@pytest.mark.parametrize("func", [bool, float, int, complex]) +@pytest.mark.parametrize("shape", [(2,), (1, 2), (3, 4, 5), (0,)]) +def test_copy_scalar_invalid_shape(func, shape): + try: + X = dpt.usm_ndarray(shape, dtype="i8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + func(X) + + +def test_index_noninteger(): + import operator + + try: + X = dpt.usm_ndarray(1, "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(IndexError): + operator.index(X) + + +@pytest.mark.parametrize( + "ind", + [ + tuple(), + (None,), + ( + None, + Ellipsis, + None, + ), + (2, 2, None, 3, 4), + (Ellipsis,), + (None, slice(0, None, 2), Ellipsis, slice(0, None, 3)), + (None, slice(1, None, 2), Ellipsis, slice(1, None, 3)), + (None, slice(None, -1, -2), Ellipsis, slice(2, None, 3)), + ( + slice(None, None, -1), + slice(None, None, -1), + slice(0, None, 3), + slice(1, None, 2), + ), + ], +) +def test_basic_slice(ind): + try: + X = dpt.usm_ndarray((2 * 3, 2 * 4, 3 * 5, 2 * 7), dtype="u1") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xnp = np.empty(X.shape, dtype=X.dtype) + S = X[ind] + Snp = Xnp[ind] + assert S.shape == Snp.shape + assert S.strides == Snp.strides + assert S.dtype == X.dtype + + +def test_empty_slice(): + # see gh801 + try: + X = dpt.empty((1, 0, 1), dtype="u1") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Y = X[:, ::-1, :] + assert Y.shape == X.shape + Z = X[:, ::2, :] + assert Z.shape == X.shape + X = dpt.empty(0) + Y = X[::-1] + assert Y.shape == X.shape + Z = X[::2] + assert Z.shape == X.shape + X = dpt.empty((0, 4), dtype="u1") + assert X[:, 1].shape == (0,) + assert X[:, 1:3].shape == (0, 2) + + +def test_slice_constructor_1d(): + Xh = np.arange(37, dtype="i4") + try: + Xusm = dpt.arange(Xh.size, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for ind in [ + slice(1, None, 2), + slice(0, None, 3), + slice(1, None, 3), + slice(2, None, 3), + slice(None, None, -1), + slice(-2, 2, -2), + slice(-1, 1, -2), + slice(None, None, -13), + ]: + assert np.array_equal( + dpt.asnumpy(Xusm[ind]), Xh[ind] + ), "Failed for {}".format(ind) + + +def test_slice_constructor_3d(): + Xh = np.ones((37, 24, 35), dtype="i4") + try: + Xusm = dpt.ones(Xh.shape, 
dtype=Xh.dtype) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for ind in [ + slice(1, None, 2), + slice(0, None, 3), + slice(1, None, 3), + slice(2, None, 3), + slice(None, None, -1), + slice(-2, 2, -2), + slice(-1, 1, -2), + slice(None, None, -13), + (slice(None, None, -2), Ellipsis, None, 15), + ]: + assert np.array_equal( + dpt.to_numpy(Xusm[ind]), Xh[ind] + ), "Failed for {}".format(ind) + + +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_slice_suai(usm_type): + Xh = np.arange(0, 10, dtype="u1") + try: + Xusm = dpt.arange(0, 10, dtype="u1", usm_type=usm_type) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for ind in [slice(2, 3, None), slice(5, 7, None), slice(3, 9, None)]: + assert np.array_equal( + dpm.as_usm_memory(Xusm[ind]).copy_to_host(), Xh[ind] + ), "Failed for {}".format(ind) + + +def test_slicing_basic(): + try: + Xusm = dpt.usm_ndarray((10, 5), dtype="c8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xusm[None] + Xusm[...] + Xusm[8] + Xusm[-3] + with pytest.raises(IndexError): + Xusm[..., ...] + with pytest.raises(IndexError): + Xusm[1, 1, :, 1] + Xusm[:, -4] + with pytest.raises(IndexError): + Xusm[:, -128] + with pytest.raises(IndexError): + Xusm[{1, 2, 3, 4, 5, 6, 7}] + X = dpt.usm_ndarray(10, "u1") + X.usm_data.copy_from_host(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09") + int( + X[X[2]] + ) # check that objects with __index__ method can be used as indices + Xh = dpm.as_usm_memory(X[X[2] : X[5]]).copy_to_host() + Xnp = np.arange(0, 10, dtype="u1") + assert np.array_equal(Xh, Xnp[Xnp[2] : Xnp[5]]) + + +def test_slicing_empty(): + try: + X = dpt.usm_ndarray((0, 10), dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + x = dpt.moveaxis(X, 1, 0) + # this used to raise ValueError + y = x[1] + assert y.ndim == 1 + assert y.shape == (0,) + assert y.dtype == X.dtype + assert y.usm_type == X.usm_type + assert y.sycl_queue == X.sycl_queue + w = x[1:3] + assert w.ndim == 2 + assert w.shape == ( + 2, + 0, + ) + assert w.dtype == X.dtype + assert w.usm_type == X.usm_type + assert w.sycl_queue == X.sycl_queue + + +def test_ctor_invalid_shape(): + with pytest.raises(TypeError): + dpt.usm_ndarray(dict()) + + +def test_ctor_invalid_order(): + get_queue_or_skip() + with pytest.raises(ValueError): + dpt.usm_ndarray((5, 5, 3), order="Z") + with pytest.raises(ValueError): + dpt.usm_ndarray((10), strides=(1,), order="Z") + with pytest.raises(ValueError): + dpt.usm_ndarray((), order="Z") + + +def test_ctor_buffer_kwarg(): + try: + dpt.usm_ndarray(10, dtype="i8", buffer=b"device") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.usm_ndarray(10, buffer="invalid_param") + Xusm = dpt.usm_ndarray((10, 5), dtype="c8") + Xusm[...] 
= 1 + X2 = dpt.usm_ndarray(Xusm.shape, buffer=Xusm, dtype=Xusm.dtype) + Horig_copy = Xusm.usm_data.copy_to_host() + H2_copy = X2.usm_data.copy_to_host() + assert np.array_equal(Horig_copy, H2_copy) + with pytest.raises(ValueError): + dpt.usm_ndarray(10, dtype="i4", buffer=dict()) + # use device-specific default fp data type + X3 = dpt.usm_ndarray(Xusm.shape, buffer=Xusm) + assert np.array_equal(Horig_copy, X3.usm_data.copy_to_host()) + + +def test_usm_ndarray_props(): + try: + Xusm = dpt.usm_ndarray((10, 5), dtype="c8", order="F") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xusm.ndim + repr(Xusm) + Xusm.flags + Xusm.__sycl_usm_array_interface__ + Xusm.device + Xusm.strides + Xusm.real + Xusm.imag + try: + dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("Sycl device CPU was not detected") + Xusm.to_device("cpu") + + +def test_datapi_device(): + try: + X = dpt.usm_ndarray(1, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + dev_t = type(X.device) + with pytest.raises(TypeError): + dev_t() + dev_t.create_device(X.device) + dev_t.create_device(X.sycl_queue) + d1 = dev_t.create_device(X.sycl_device) + d2 = dev_t.create_device(X.sycl_device.filter_string) + d3 = dev_t.create_device(None) + assert d1.sycl_queue == d2.sycl_queue + assert d1.sycl_queue == d3.sycl_queue + X.device.sycl_context + X.device.sycl_queue + X.device.sycl_device + repr(X.device) + X.device.print_device_info() + + +def _pyx_capi_int(X, pyx_capi_name, caps_name=b"int", val_restype=ctypes.c_int): + import sys + + mod = sys.modules[X.__class__.__module__] + cap = mod.__pyx_capi__.get(pyx_capi_name, None) + if cap is None: + raise ValueError( + "__pyx_capi__ does not export {} capsule".format(pyx_capi_name) + ) + # construct Python callable to invoke these functions + cap_ptr_fn = ctypes.pythonapi.PyCapsule_GetPointer + cap_ptr_fn.restype = ctypes.c_void_p + cap_ptr_fn.argtypes = [ctypes.py_object, ctypes.c_char_p] + cap_ptr = cap_ptr_fn(cap, caps_name) + val_ptr = ctypes.cast(cap_ptr, ctypes.POINTER(val_restype)) + return val_ptr.contents.value + + +def test_pyx_capi_check_constants(): + try: + X = dpt.usm_ndarray(17, dtype="i1")[1::2] + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + cc_flag = _pyx_capi_int(X, "USM_ARRAY_C_CONTIGUOUS") + assert cc_flag > 0 and 0 == (cc_flag & (cc_flag - 1)) + fc_flag = _pyx_capi_int(X, "USM_ARRAY_F_CONTIGUOUS") + assert fc_flag > 0 and 0 == (fc_flag & (fc_flag - 1)) + w_flag = _pyx_capi_int(X, "USM_ARRAY_WRITABLE") + assert w_flag > 0 and 0 == (w_flag & (w_flag - 1)) + + bool_typenum = _pyx_capi_int(X, "UAR_BOOL") + assert bool_typenum == dpt.dtype("bool_").num + + byte_typenum = _pyx_capi_int(X, "UAR_BYTE") + assert byte_typenum == dpt.dtype(np.byte).num + ubyte_typenum = _pyx_capi_int(X, "UAR_UBYTE") + assert ubyte_typenum == dpt.dtype(np.ubyte).num + + short_typenum = _pyx_capi_int(X, "UAR_SHORT") + assert short_typenum == dpt.dtype(np.short).num + ushort_typenum = _pyx_capi_int(X, "UAR_USHORT") + assert ushort_typenum == dpt.dtype(np.ushort).num + + int_typenum = _pyx_capi_int(X, "UAR_INT") + assert int_typenum == dpt.dtype(np.intc).num + uint_typenum = _pyx_capi_int(X, "UAR_UINT") + assert uint_typenum == dpt.dtype(np.uintc).num + + long_typenum = _pyx_capi_int(X, "UAR_LONG") + assert long_typenum == dpt.dtype("l").num + ulong_typenum = _pyx_capi_int(X, "UAR_ULONG") + assert ulong_typenum == dpt.dtype("L").num + + longlong_typenum = 
_pyx_capi_int(X, "UAR_LONGLONG") + assert longlong_typenum == dpt.dtype(np.longlong).num + ulonglong_typenum = _pyx_capi_int(X, "UAR_ULONGLONG") + assert ulonglong_typenum == dpt.dtype(np.ulonglong).num + + half_typenum = _pyx_capi_int(X, "UAR_HALF") + assert half_typenum == dpt.dtype(np.half).num + float_typenum = _pyx_capi_int(X, "UAR_FLOAT") + assert float_typenum == dpt.dtype(np.single).num + double_typenum = _pyx_capi_int(X, "UAR_DOUBLE") + assert double_typenum == dpt.dtype(np.double).num + + cfloat_typenum = _pyx_capi_int(X, "UAR_CFLOAT") + assert cfloat_typenum == dpt.dtype(np.csingle).num + cdouble_typenum = _pyx_capi_int(X, "UAR_CDOUBLE") + assert cdouble_typenum == dpt.dtype(np.cdouble).num + + +@pytest.mark.parametrize( + "shape", [tuple(), (1,), (5,), (2, 3), (2, 3, 4), (2, 2, 2, 2, 2)] +) +@pytest.mark.parametrize( + "dtype", + _all_dtypes, +) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_tofrom_numpy(shape, dtype, usm_type): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + Xusm = dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q) + Ynp = np.ones(shape, dtype=dtype) + Ynp[(0,) * len(shape)] = 0 + ind = (slice(None, None, None),) * Ynp.ndim + Xusm[ind] = Ynp + assert np.array_equal(dpt.to_numpy(Xusm), Ynp) + + +@pytest.mark.parametrize( + "dtype", + _all_dtypes, +) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_tofrom_numpy_permuted(dtype, usm_type): + shape = (3, 5, 7) + perm = (1, 2, 0) + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + Xusm = dpt.permute_dims( + dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q), perm + ) + Ynp = np.transpose(np.ones(shape, dtype=dtype), perm) + Ynp[:, ::2, ::2] = 0 + ind = (slice(None, None, None),) * Ynp.ndim + # even though Xusm and Ynp are strided, simple memcpy could be done. 
+    # This test validates that it is being done correctly
+    Xusm[ind] = Ynp
+    assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("src_usm_type", ["device", "shared", "host"])
+@pytest.mark.parametrize("dst_usm_type", ["device", "shared", "host"])
+def test_setitem_same_dtype(dtype, src_usm_type, dst_usm_type):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    shape = (2, 4, 3)
+    Xnp = (
+        np.random.randint(-10, 10, size=prod(shape))
+        .astype(dtype)
+        .reshape(shape)
+    )
+    X = dpt.from_numpy(Xnp, usm_type=src_usm_type)
+    Z = dpt.zeros(shape, dtype=dtype, usm_type=dst_usm_type)
+    Zusm_0d = dpt.copy(Z[0, 0, 0])
+    ind = (-1, -1, -1)
+    Xusm_0d = X[ind]
+    Zusm_0d[Ellipsis] = Xusm_0d
+    assert np.array_equal(dpt.to_numpy(Zusm_0d), Xnp[ind])
+    Zusm_1d = dpt.copy(Z[0, 1:3, 0])
+    ind = (-1, slice(0, 2, None), -1)
+    Xusm_1d = X[ind]
+    Zusm_1d[Ellipsis] = Xusm_1d
+    assert np.array_equal(dpt.to_numpy(Zusm_1d), Xnp[ind])
+    Zusm_2d = dpt.copy(Z[:, 1:3, 0])[::-1]
+    Xusm_2d = X[:, 1:4, -1]
+    Zusm_2d[:] = Xusm_2d[:, 0:2]
+    assert np.array_equal(dpt.to_numpy(Zusm_2d), Xnp[:, 1:3, -1])
+    Zusm_3d = dpt.copy(Z)
+    Xusm_3d = X
+    Zusm_3d[:] = Xusm_3d
+    assert np.array_equal(dpt.to_numpy(Zusm_3d), Xnp)
+    Zusm_3d[::-1] = Xusm_3d[::-1]
+    assert np.array_equal(dpt.to_numpy(Zusm_3d), Xnp)
+    Zusm_3d[:] = Xusm_3d[0]
+    R1 = dpt.to_numpy(Zusm_3d)
+    R2 = np.broadcast_to(Xnp[0], R1.shape)
+    assert R1.shape == R2.shape
+    assert np.allclose(R1, R2)
+    Zusm_empty = Zusm_1d[0:0]
+    Zusm_empty[Ellipsis] = Zusm_3d[0, 0, 0:0]
+
+
+def test_setitem_broadcasting():
+    "See gh-1503"
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="u4")
+    src = dpt.zeros((3, 1), dtype=dst.dtype)
+    dst[...] = src
+    expected = np.zeros(dst.shape, dtype=dst.dtype)
+    assert np.array_equal(dpt.asnumpy(dst), expected)
+
+
+def test_setitem_broadcasting_offset():
+    get_queue_or_skip()
+    dt = dpt.int32
+    x = dpt.asarray([[1, 2, 3], [6, 7, 8]], dtype=dt)
+    y = dpt.asarray([4, 5], dtype=dt)
+    x[0] = y[1]
+    expected = dpt.asarray([[5, 5, 5], [6, 7, 8]], dtype=dt)
+    assert dpt.all(x == expected)
+
+
+def test_setitem_broadcasting_empty_dst_validation():
+    "Broadcasting rules apply to empty arrays too, hence the exception"
+    get_queue_or_skip()
+    dst = dpt.ones((2, 0, 5, 4), dtype="i8")
+    src = dpt.ones((2, 0, 3, 4), dtype="i8")
+    with pytest.raises(ValueError):
+        dst[...] = src
+
+
+def test_setitem_broadcasting_empty_dst_edge_case():
+    """RHS is shrunk to an empty array by the
+    broadcasting rule, hence no exception"""
+    get_queue_or_skip()
+    dst = dpt.ones(1, dtype="i8")[0:0]
+    src = dpt.ones(tuple(), dtype="i8")
+    dst[...] = src
+
+
+def test_setitem_broadcasting_src_ndim_equal_dst_ndim():
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="i4")
+    src = dpt.zeros((2, 1, 4), dtype="i4")
+    dst[...] = src
+
+    expected = np.zeros(dst.shape, dtype=dst.dtype)
+    assert np.array_equal(dpt.asnumpy(dst), expected)
+
+
+def test_setitem_broadcasting_src_ndim_greater_than_dst_ndim():
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="i4")
+    src = dpt.zeros((1, 2, 1, 4), dtype="i4")
+    dst[...] 
= src + + expected = np.zeros(dst.shape, dtype=dst.dtype) + assert np.array_equal(dpt.asnumpy(dst), expected) + + +@pytest.mark.parametrize( + "dtype", + _all_dtypes, +) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_setitem_scalar(dtype, usm_type): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.usm_ndarray((6, 6), dtype=dtype, buffer=usm_type) + for i in range(X.size): + X[np.unravel_index(i, X.shape)] = np.asarray(i, dtype=dtype) + assert np.array_equal( + dpt.to_numpy(X), np.arange(X.size).astype(dtype).reshape(X.shape) + ) + Y = dpt.usm_ndarray((2, 3), dtype=dtype, buffer=usm_type) + for i in range(Y.size): + Y[np.unravel_index(i, Y.shape)] = i + assert np.array_equal( + dpt.to_numpy(Y), np.arange(Y.size).astype(dtype).reshape(Y.shape) + ) + + +def test_setitem_errors(): + q = get_queue_or_skip() + X = dpt.empty((4,), dtype="u1", sycl_queue=q) + Y = dpt.empty((4, 2), dtype="u1", sycl_queue=q) + with pytest.raises(ValueError): + X[:] = Y + with pytest.raises(ValueError): + X[:] = Y[:, 0:1] + X[:] = Y[None, :, 0] + + +@pytest.mark.parametrize("src_dt,dst_dt", [("i4", "i8"), ("f4", "f8")]) +def test_setitem_different_dtypes(src_dt, dst_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dst_dt, q) + X = dpt.ones(10, dtype=src_dt, sycl_queue=q) + Y = dpt.zeros(10, dtype=src_dt, sycl_queue=q) + Z = dpt.empty((20,), dtype=dst_dt, sycl_queue=q) + Z[::2] = X + Z[1::2] = Y + assert np.allclose(dpt.asnumpy(Z), np.tile(np.array([1, 0], Z.dtype), 10)) + + +def test_setitem_wingaps(): + q = get_queue_or_skip() + if dpt.dtype("intc").itemsize == dpt.dtype("int32").itemsize: + dpt_dst = dpt.empty(4, dtype="int32", sycl_queue=q) + np_src = np.arange(4, dtype="intc") + dpt_dst[:] = np_src # should not raise exceptions + assert np.array_equal(dpt.asnumpy(dpt_dst), np_src) + if dpt.dtype("long").itemsize == dpt.dtype("longlong").itemsize: + dpt_dst = dpt.empty(4, dtype="longlong", sycl_queue=q) + np_src = np.arange(4, dtype="long") + dpt_dst[:] = np_src # should not raise exceptions + assert np.array_equal(dpt.asnumpy(dpt_dst), np_src) + + +def test_shape_setter(): + def cc_strides(sh): + return np.empty(sh, dtype="u1").strides + + def relaxed_strides_equal(st1, st2, sh): + eq_ = True + for s1, s2, d in zip(st1, st2, sh): + eq_ = eq_ and ((d == 1) or (s1 == s2)) + return eq_ + + sh_s = (2 * 3 * 4 * 5,) + sh_f = ( + 2, + 3, + 4, + 5, + ) + try: + X = dpt.usm_ndarray(sh_s, dtype="i8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X.shape = sh_f + assert X.shape == sh_f + assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f) + assert X.flags.c_contiguous, "reshaped array expected to be C-contiguous" + + sh_s = ( + 2, + 12, + 5, + ) + sh_f = ( + 2, + 3, + 4, + 5, + ) + X = dpt.usm_ndarray(sh_s, dtype="u4", order="C") + X.shape = sh_f + assert X.shape == sh_f + assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f) + + sh_s = (2, 3, 4, 5) + sh_f = (4, 3, 2, 5) + X = dpt.usm_ndarray(sh_s, dtype="f4") + X.shape = sh_f + assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f) + + sh_s = (2, 3, 4, 5) + sh_f = (4, 3, 1, 2, 5) + X = dpt.usm_ndarray(sh_s, dtype="?") + X.shape = sh_f + assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f) + sz = X.size + X.shape = sz + assert X.shape == (sz,) + assert relaxed_strides_equal(X.strides, (1,), (sz,)) + + X = dpt.usm_ndarray(sh_s, dtype="u4") + with pytest.raises(TypeError): + X.shape = "abcbe" + X = dpt.usm_ndarray((4, 4), 
dtype="u1")[::2, ::2] + with pytest.raises(AttributeError): + X.shape = (4,) + X = dpt.usm_ndarray((0,), dtype="i4") + X.shape = (0,) + X.shape = ( + 2, + 0, + ) + X.shape = ( + 0, + 2, + ) + X.shape = ( + 1, + 0, + 1, + ) + + +def test_len(): + try: + X = dpt.usm_ndarray(1, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert len(X) == 1 + X = dpt.usm_ndarray((2, 1), "i4") + assert len(X) == 2 + X = dpt.usm_ndarray(tuple(), "i4") + with pytest.raises(TypeError): + len(X) + + +def test_array_namespace(): + try: + X = dpt.usm_ndarray(1, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X.__array_namespace__() + X._set_namespace(dpt) + assert X.__array_namespace__() is dpt + X.__array_namespace__(api_version=dpt.__array_api_version__) + assert X.__array_namespace__() is dpt + + +def test_dlpack(): + try: + X = dpt.usm_ndarray(1, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X.__dlpack_device__() + X.__dlpack__(stream=None) + + +def test_to_device(): + try: + X = dpt.usm_ndarray(1, "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for dev in dpctl.get_devices(): + if dev.default_selector_score > 0: + Y = X.to_device(dev) + assert Y.sycl_device == dev + + +def test_to_device_stream_validation(): + try: + X = dpt.usm_ndarray(1, "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + # invalid type of stream keyword + with pytest.raises(TypeError): + X.to_device(X.sycl_queue, stream=dict()) + # stream is keyword-only arg + with pytest.raises(TypeError): + X.to_device(X.sycl_queue, X.sycl_queue) + + +def test_to_device_stream_use(): + try: + X = dpt.usm_ndarray(1, "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + q1 = dpctl.SyclQueue( + X.sycl_context, X.sycl_device, property="enable_profiling" + ) + X.to_device(q1, stream=q1) + + +def test_to_device_migration(): + q1 = get_queue_or_skip() # two distinct copies of default-constructed queue + q2 = get_queue_or_skip() + X1 = dpt.empty((5,), dtype="i8", sycl_queue=q1) # X1 is associated with q1 + X2 = X1.to_device(q2) # X2 is reassociated with q2 + assert X1.sycl_queue == q1 + assert X2.sycl_queue == q2 + assert X1.usm_data._pointer == X2.usm_data._pointer + + +def test_astype(): + try: + X = dpt.empty((5, 5), dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X[:] = np.full((5, 5), 7, dtype="i4") + Y = dpt.astype(X, "c8", order="C") + assert np.allclose(dpt.to_numpy(Y), np.full((5, 5), 7, dtype="c8")) + if Y.sycl_device.has_aspect_fp16: + Y = dpt.astype(X[::2, ::-1], "f2", order="K") + assert np.allclose(dpt.to_numpy(Y), np.full(Y.shape, 7, dtype="f2")) + Y = dpt.astype(X[::2, ::-1], "f4", order="K") + assert np.allclose(dpt.to_numpy(Y), np.full(Y.shape, 7, dtype="f4")) + Y = dpt.astype(X[::2, ::-1], "i4", order="K", copy=False) + assert Y.usm_data is X.usm_data + Y = dpt.astype(X, None, order="K") + if X.sycl_queue.sycl_device.has_aspect_fp64: + assert Y.dtype is dpt.float64 + else: + assert Y.dtype is dpt.float32 + + +def test_astype_invalid_order(): + try: + X = dpt.usm_ndarray(5, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.astype(X, "i4", order="WRONG") + + +def test_astype_device(): + get_queue_or_skip() + q1 = dpctl.SyclQueue() + q2 = dpctl.SyclQueue() + + x = 
dpt.arange(5, dtype="i4", sycl_queue=q1) + r = dpt.astype(x, "f4") + assert r.sycl_queue == x.sycl_queue + assert r.sycl_device == x.sycl_device + + r = dpt.astype(x, "f4", device=q2) + assert r.sycl_queue == q2 + + +def test_astype_gh_1926(): + get_queue_or_skip() + + x = dpt.ones(64) + x_ = dpt.astype(x, x.dtype, copy=False, order="C") + assert x is x_ + + x__ = dpt.astype(x, x.dtype, copy=False, order="F") + assert x is x__ + + +def test_astype_gh_2121(): + get_queue_or_skip() + + x_np = np.asarray([0, 3, 1, 2, 0, 1], dtype="u1").view("?") + x = dpt.asarray(x_np) + res = dpt.astype(x, dpt.uint8) + expected = dpt.asarray([0, 1, 1, 1, 0, 1], dtype="u1") + assert dpt.all(res == expected) + + +def test_copy(): + try: + X = dpt.usm_ndarray((5, 5), "i4")[2:4, 1:4] + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X[:] = 42 + Yc = dpt.copy(X, order="C") + Yf = dpt.copy(X, order="F") + Ya = dpt.copy(X, order="A") + Yk = dpt.copy(X, order="K") + assert Yc.usm_data is not X.usm_data + assert Yf.usm_data is not X.usm_data + assert Ya.usm_data is not X.usm_data + assert Yk.usm_data is not X.usm_data + assert Yc.strides == (3, 1) + assert Yf.strides == (1, 2) + assert Ya.strides == (3, 1) + assert Yk.strides == (3, 1) + ref = np.full(X.shape, 42, dtype=X.dtype) + assert np.array_equal(dpt.asnumpy(Yc), ref) + assert np.array_equal(dpt.asnumpy(Yf), ref) + assert np.array_equal(dpt.asnumpy(Ya), ref) + assert np.array_equal(dpt.asnumpy(Yk), ref) + + +def test_copy_unaligned(): + get_queue_or_skip() + + x = dpt.ones(513, dtype="i4") + r = dpt.astype(x[1:], "f4") + + assert dpt.all(r == 1) + + +def test_ctor_invalid(): + try: + m = dpm.MemoryUSMShared(12) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.usm_ndarray((4,), dtype="i4", buffer=m) + m = dpm.MemoryUSMShared(64) + with pytest.raises(ValueError): + dpt.usm_ndarray((4,), dtype="u1", buffer=m, strides={"not": "valid"}) + + +def test_reshape(): + try: + X = dpt.usm_ndarray((5, 5), "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + # can be done as views + Y = dpt.reshape(X, (25,)) + assert Y.shape == (25,) + Z = X[::2, ::2] + # requires a copy + W = dpt.reshape(Z, (Z.size,), order="F") + assert W.shape == (Z.size,) + with pytest.raises(TypeError): + dpt.reshape("invalid") + with pytest.raises(ValueError): + dpt.reshape(Z, (2, 2, 2, 2, 2)) + with pytest.raises(ValueError): + dpt.reshape(Z, Z.shape, order="invalid") + W = dpt.reshape(Z, (-1,), order="C") + assert W.shape == (Z.size,) + + X = dpt.usm_ndarray((1,), dtype="i8") + Y = dpt.reshape(X, X.shape) + assert Y.flags == X.flags + + A = dpt.usm_ndarray((0,), "i4") + A1 = dpt.reshape(A, (0,)) + assert A1.shape == (0,) + requested_shape = ( + 2, + 0, + ) + A2 = dpt.reshape(A, requested_shape) + assert A2.shape == requested_shape + requested_shape = ( + 0, + 2, + ) + A3 = dpt.reshape(A, requested_shape) + assert A3.shape == requested_shape + requested_shape = ( + 1, + 0, + 2, + ) + A4 = dpt.reshape(A, requested_shape) + assert A4.shape == requested_shape + + +def test_reshape_orderF(): + try: + a = dpt.arange(6 * 3 * 4, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + b = dpt.reshape(a, (6, 2, 6)) + c = dpt.reshape(b, (9, 8), order="F") + assert c.flags.f_contiguous + assert c._pointer != b._pointer + assert b._pointer == a._pointer + + a_np = np.arange(6 * 3 * 4, dtype="i4") + b_np = np.reshape(a_np, (6, 2, 
6)) + c_np = np.reshape(b_np, (9, 8), order="F") + assert np.array_equal(c_np, dpt.asnumpy(c)) + + +def test_reshape_noop(): + """Per gh-1664""" + try: + a = dpt.ones((2, 1)) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + b = dpt.reshape(a, (2, 1)) + assert b is a + + +def test_reshape_zero_size(): + try: + a = dpt.empty((0,)) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.reshape(a, (-1, 0)) + + +def test_reshape_large_ndim(): + ndim = 32 + idx = tuple(1 if i + 1 < ndim else ndim for i in range(ndim)) + try: + d = dpt.ones(ndim, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + d = dpt.reshape(d, idx) + assert d.shape == idx + + +def test_reshape_copy_kwrd(): + try: + X = dpt.usm_ndarray((2, 3), "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + new_shape = (6,) + Z = dpt.reshape(X, new_shape, copy=True) + assert Z.shape == new_shape + assert Z.usm_data is not X.usm_data + X = dpt.usm_ndarray((3, 3), "i4")[::2, ::2] + new_shape = (4,) + with pytest.raises(ValueError): + Z = dpt.reshape(X, new_shape, copy=False) + with pytest.raises(ValueError): + invalid = Ellipsis + Z = dpt.reshape(X, new_shape, copy=invalid) + + +def test_transpose(): + n, m = 2, 3 + try: + X = dpt.usm_ndarray((n, m), "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xnp = np.arange(n * m, dtype="f4").reshape((n, m)) + X[:] = Xnp + assert np.array_equal(dpt.to_numpy(X.T), Xnp.T) + assert np.array_equal(dpt.to_numpy(X[1:].T), Xnp[1:].T) + + +def test_real_imag_views(): + n, m = 2, 3 + try: + X = dpt.usm_ndarray((n, m), "c8") + X_scalar = dpt.usm_ndarray((), dtype="c8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xnp_r = np.arange(n * m, dtype="f4").reshape((n, m)) + Xnp_i = np.arange(n * m, 2 * n * m, dtype="f4").reshape((n, m)) + Xnp = Xnp_r + 1j * Xnp_i + X[:] = Xnp + X_real = X.real + X_imag = X.imag + assert np.array_equal(dpt.to_numpy(X_real), Xnp.real) + assert np.array_equal(dpt.to_numpy(X.imag), Xnp.imag) + assert not X_real.flags["C"] and not X_real.flags["F"] + assert not X_imag.flags["C"] and not X_imag.flags["F"] + assert X_real.strides == X_imag.strides + assert np.array_equal(dpt.to_numpy(X[1:].real), Xnp[1:].real) + assert np.array_equal(dpt.to_numpy(X[1:].imag), Xnp[1:].imag) + + X_scalar[...] 
= complex(n * m, 2 * n * m)
+    assert X_scalar.real and X_scalar.imag
+
+    # check that _zero_like works for scalars
+    X_scalar = dpt.usm_ndarray((), dtype="f4")
+    assert isinstance(X_scalar.imag, dpt.usm_ndarray)
+    assert not X_scalar.imag
+    assert X_scalar.real.sycl_queue == X_scalar.imag.sycl_queue
+
+
+def test_real_imag_views_fp16():
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dpt.float16, q)
+
+    X = dpt.usm_ndarray(
+        (3, 4), dtype=dpt.float16, buffer_ctor_kwargs={"queue": q}
+    )
+    assert isinstance(X.real, dpt.usm_ndarray) and isinstance(
+        X.imag, dpt.usm_ndarray
+    )
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_zeros(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros(10, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.zeros(10, dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_ones(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones(10, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.ones(10, dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_full(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.full(10, 4, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.full(10, 4, dtype=dtype))
+
+
+def test_full_cmplx128():
+    q = get_queue_or_skip()
+    dtype = "c16"
+    skip_if_dtype_not_supported(dtype, q)
+    fill_v = 1 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 0j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+
+
+def test_full_dtype_inference():
+    try:
+        X = dpt.full(10, 4)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert np.issubdtype(X.dtype, np.integer)
+    try:
+        X = dpt.full(10, True)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert X.dtype is dpt.dtype(np.bool_)
+    assert np.issubdtype(dpt.full(10, 12.3).dtype, np.floating)
+    try:
+        X = dpt.full(10, 0.3 - 2j)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    cdt = X.dtype
+    assert np.issubdtype(cdt, np.complexfloating)
+
+    assert np.issubdtype(dpt.full(10, 12.3, dtype=int).dtype, np.integer)
+    assert np.issubdtype(dpt.full(10, 0.3 - 2j, dtype=int).dtype, np.integer)
+    rdt = np.finfo(cdt).dtype
+    assert np.issubdtype(dpt.full(10, 0.3 - 2j, dtype=rdt).dtype, np.floating)
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_full_special_fp(dt):
+    """See gh-1314"""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    ar = dpt.full(10, fill_value=dpt.nan, dtype=dt)
+    err_msg = f"Failed for fill_value=dpt.nan and dtype {dt}"
+    assert dpt.isnan(ar[0]), err_msg
+
+    ar = dpt.full(10, fill_value=dpt.inf, dtype=dt)
+    err_msg = f"Failed for fill_value=dpt.inf and dtype {dt}"
+    assert dpt.isinf(ar[0]) and dpt.greater(ar[0], 0), err_msg
+
+    ar = dpt.full(10, fill_value=-dpt.inf, dtype=dt)
+    err_msg = f"Failed for fill_value=-dpt.inf and dtype {dt}"
+    assert dpt.isinf(ar[0]) and dpt.less(ar[0], 0), err_msg
+
+    ar = dpt.full(10, fill_value=dpt.pi, dtype=dt)
+    err_msg = f"Failed for fill_value=dpt.pi and dtype {dt}"
+    check = abs(float(ar[0]) - dpt.pi) < 16 * dpt.finfo(ar.dtype).eps
+    assert check, err_msg
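+
+
+# Worked bound for the dpt.pi check above (illustrative): for "f2" the
+# nearest half-precision value to pi is 3.140625, and
+# abs(3.140625 - pi) ~= 9.7e-4 < 16 * finfo("f2").eps == 1.5625e-2,
+# so a fill value rounded to the target precision still passes.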
+
+
+def test_full_fill_array():
+    q = get_queue_or_skip()
+
+    Xnp = np.array([1, 2, 3], dtype="i4")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    shape = (3, 3)
+    Y = dpt.full(shape, X)
+    Ynp = np.full(shape, Xnp)
+
+    assert Y.dtype == Ynp.dtype
+    assert Y.usm_type == "device"
+    assert np.array_equal(dpt.asnumpy(Y), Ynp)
+
+
+def test_full_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    X = dpt.arange(10, dtype="i4", sycl_queue=q1, usm_type="shared")
+    Y = dpt.full(10, X[3])
+
+    assert Y.dtype == X.dtype
+    assert Y.usm_type == X.usm_type
+    assert dpt.get_execution_queue((Y.sycl_queue, X.sycl_queue))
+    assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="i4"))
+
+    Y = dpt.full(10, X[3], dtype="f4", sycl_queue=q2, usm_type="host")
+
+    assert Y.dtype == dpt.dtype("f4")
+    assert Y.usm_type == "host"
+    assert dpt.get_execution_queue((Y.sycl_queue, q2))
+    assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="f4"))
+
+
+@pytest.mark.parametrize("order1", ["F", "C"])
+@pytest.mark.parametrize("order2", ["F", "C"])
+def test_full_order(order1, order2):
+    q = get_queue_or_skip()
+    Xnp = np.array([1, 2, 3], order=order1)
+    Ynp = np.full((3, 3), Xnp, order=order2)
+    Y = dpt.full((3, 3), Xnp, order=order2, sycl_queue=q)
+    assert Y.flags.c_contiguous == Ynp.flags.c_contiguous
+    assert Y.flags.f_contiguous == Ynp.flags.f_contiguous
+    assert np.array_equal(dpt.asnumpy(Y), Ynp)
+
+
+def test_full_strides():
+    q = get_queue_or_skip()
+    X = dpt.full((3, 3), dpt.arange(3, dtype="i4"), sycl_queue=q)
+    Xnp = np.full((3, 3), np.arange(3, dtype="i4"))
+    assert X.strides == tuple(el // Xnp.itemsize for el in Xnp.strides)
+    assert np.array_equal(dpt.asnumpy(X), Xnp)
+
+    X = dpt.full((3, 3), dpt.arange(6, dtype="i4")[::2], sycl_queue=q)
+    Xnp = np.full((3, 3), np.arange(6, dtype="i4")[::2])
+    assert X.strides == tuple(el // Xnp.itemsize for el in Xnp.strides)
+    assert np.array_equal(dpt.asnumpy(X), Xnp)
+
+
+@pytest.mark.parametrize("dt", ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"])
+def test_full_gh_1230(dt):
+    get_queue_or_skip()
+    dtype = dpt.dtype(dt)
+    dt_maxint = dpt.iinfo(dtype).max
+
+    if (dtype.itemsize < 8) and (np.lib.NumpyVersion(np.__version__) < "2.0.0"):
+        try:
+            X = dpt.full(1, fill_value=(dt_maxint + 1), dtype=dt)
+        except OverflowError:
+            pytest.skip("Expected OverflowError raised")
+        Y = dpt.full_like(X, fill_value=dpt.iinfo(dt).min)
+        assert dpt.all(X == Y)
+    else:
+        with pytest.raises(OverflowError):
+            dpt.full(1, dt_maxint + 1, dtype=dt)
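+
+
+# Note on the gh-1230 branch above (illustrative): under NumPy < 2.0 an
+# out-of-range Python int fill value wraps around for dtypes narrower than
+# 64 bits (e.g. for "i1", iinfo("i1").max + 1 == 128 wraps to
+# iinfo("i1").min == -128), while NumPy >= 2.0 raises OverflowError instead.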
dtype="f4", device=q).shape == (4,) + + has_fp64 = q.sycl_device.has_aspect_fp64 + if has_fp64: + assert dpt.arange(7, 0, -2, dtype="f8", device=q).shape == (4,) + assert dpt.arange(0, 1, 0.25, dtype="f4", device=q).shape == (4,) + + x = dpt.arange(9.7, stop=10, sycl_queue=q) + assert x.shape == (1,) + assert x.dtype == dpt.float64 if has_fp64 else dpt.float32 + + +def test_arange_step_None(): + q = get_queue_or_skip() + + x = dpt.arange(0, stop=10, step=None, dtype="int32", sycl_queue=q) + assert x.shape == (10,) + + +def test_arange_bool(): + q = get_queue_or_skip() + + x = dpt.arange(0, stop=2, dtype="bool", sycl_queue=q) + assert x.shape == (2,) + assert x.dtype == dpt.bool + + +def test_arange_mixed_types(): + q = get_queue_or_skip() + + x = dpt.arange(-2.5, stop=200, step=100, dtype="int32", sycl_queue=q) + assert x.shape[0] == 3 + assert int(x[1]) == 99 + int(x[0]) + + x = dpt.arange(+2.5, stop=200, step=100, dtype="int32", device=x.device) + assert x.shape[0] == 2 + assert int(x[1]) == 100 + int(x[0]) + + _stop = np.float32(504) + x = dpt.arange(0, stop=_stop, step=100, dtype="f4", device=x.device) + assert x.shape == (6,) + + # ensure length is determined using uncast parameters + x = dpt.arange(-5, stop=10**2, step=2.7, dtype="int64", device=x.device) + assert x.shape == (39,) + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +def test_linspace(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + X = dpt.linspace(0, 1, num=2, dtype=dt, sycl_queue=q) + assert np.allclose(dpt.asnumpy(X), np.linspace(0, 1, num=2, dtype=dt)) + + +def test_linspace_fp(): + q = get_queue_or_skip() + n = 16 + X = dpt.linspace(0, n - 1, num=n, sycl_queue=q) + if q.sycl_device.has_aspect_fp64: + assert X.dtype == dpt.dtype("float64") + else: + assert X.dtype == dpt.dtype("float32") + assert X.shape == (n,) + assert X.strides == (1,) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_linspace_fp_max(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + n = 16 + dt = dpt.dtype(dtype) + max_ = dpt.finfo(dt).max + X = dpt.linspace(max_, max_, endpoint=True, num=n, dtype=dt, sycl_queue=q) + assert X.shape == (n,) + assert X.strides == (1,) + assert np.allclose( + dpt.asnumpy(X), np.linspace(max_, max_, endpoint=True, num=n, dtype=dt) + ) + + +def test_linspace_int(): + q = get_queue_or_skip() + X = dpt.linspace(0.1, 9.1, 11, endpoint=True, dtype=int, sycl_queue=q) + Xnp = np.linspace(0.1, 9.1, 11, endpoint=True, dtype=int) + assert np.array_equal(dpt.asnumpy(X), Xnp) + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "usm_kind", + [ + "shared", + "device", + "host", + ], +) +def test_empty_like(dt, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.empty_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + + X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.empty_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + + +def test_empty_unexpected_data_type(): + with pytest.raises(TypeError): + try: + dpt.empty(1, dtype=np.object_) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "usm_kind", + [ + "shared", 
+ "device", + "host", + ], +) +def test_zeros_like(dt, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.zeros_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.allclose(dpt.asnumpy(Y), np.zeros(X.shape, dtype=X.dtype)) + + X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.zeros_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.array_equal(dpt.asnumpy(Y), np.zeros(X.shape, dtype=X.dtype)) + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "usm_kind", + [ + "shared", + "device", + "host", + ], +) +def test_ones_like(dt, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.ones_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.allclose(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype)) + + X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.ones_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype)) + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "usm_kind", + [ + "shared", + "device", + "host", + ], +) +def test_full_like(dt, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + fill_v = dpt.dtype(dt).type(1) + X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.full_like(X, fill_v) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.allclose(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype)) + + X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.full_like(X, fill_v) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +@pytest.mark.parametrize("usm_kind", ["shared", "device", "host"]) +def test_eye(dtype, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.eye(4, 5, k=1, dtype=dtype, usm_type=usm_kind, sycl_queue=q) + Xnp = np.eye(4, 5, k=1, dtype=dtype) + assert X.dtype == Xnp.dtype + assert np.array_equal(Xnp, dpt.asnumpy(X)) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_tril(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + shape = (2, 3, 4, 5, 5) + X = dpt.reshape(dpt.arange(prod(shape), dtype=dtype, sycl_queue=q), shape) + Y = dpt.tril(X) + Xnp = np.arange(prod(shape), dtype=dtype).reshape(shape) + Ynp = np.tril(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_triu(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + shape = (4, 5) + X = dpt.reshape(dpt.arange(prod(shape), dtype=dtype, sycl_queue=q), shape) + Y = dpt.triu(X, k=1) + Xnp = np.arange(prod(shape), 
dtype=dtype).reshape(shape) + Ynp = np.triu(Xnp, k=1) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("tri_fn", [dpt.tril, dpt.triu]) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_tri_usm_type(tri_fn, usm_type): + q = get_queue_or_skip() + dtype = dpt.uint16 + + shape = (2, 3, 4, 5, 5) + size = prod(shape) + X = dpt.reshape( + dpt.arange(size, dtype=dtype, usm_type=usm_type, sycl_queue=q), shape + ) + Y = tri_fn(X) # main execution branch + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == q + Y = tri_fn(X, k=-6) # special case of Y == X + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == q + Y = tri_fn(X, k=6) # special case of Y == 0 + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == q + + +def test_tril_slice(): + q = get_queue_or_skip() + + shape = (6, 10) + X = dpt.reshape(dpt.arange(prod(shape), dtype="int", sycl_queue=q), shape)[ + 1:, ::-2 + ] + Y = dpt.tril(X) + Xnp = np.arange(prod(shape), dtype="int").reshape(shape)[1:, ::-2] + Ynp = np.tril(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_triu_permute_dims(): + q = get_queue_or_skip() + + shape = (2, 3, 4, 5) + X = dpt.permute_dims( + dpt.reshape(dpt.arange(prod(shape), dtype="int", sycl_queue=q), shape), + (3, 2, 1, 0), + ) + Y = dpt.triu(X) + Xnp = np.transpose( + np.arange(prod(shape), dtype="int").reshape(shape), (3, 2, 1, 0) + ) + Ynp = np.triu(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_tril_broadcast_to(): + q = get_queue_or_skip() + + shape = (5, 5) + X = dpt.broadcast_to(dpt.ones((1), dtype="int", sycl_queue=q), shape) + Y = dpt.tril(X) + Xnp = np.broadcast_to(np.ones((1), dtype="int"), shape) + Ynp = np.tril(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_triu_bool(): + q = get_queue_or_skip() + + shape = (4, 5) + X = dpt.ones((shape), dtype="bool", sycl_queue=q) + Y = dpt.triu(X) + Xnp = np.ones((shape), dtype="bool") + Ynp = np.triu(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize("k", [-10, -2, -1, 3, 4, 10]) +def test_triu_order_k(order, k): + q = get_queue_or_skip() + + shape = (3, 3) + X = dpt.reshape( + dpt.arange(prod(shape), dtype="int", sycl_queue=q), + shape, + order=order, + ) + Y = dpt.triu(X, k=k) + Xnp = np.arange(prod(shape), dtype="int").reshape(shape, order=order) + Ynp = np.triu(Xnp, k=k) + assert Y.dtype == Ynp.dtype + assert X.flags == Y.flags + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize("k", [-10, -4, -3, 1, 2, 10]) +def test_tril_order_k(order, k): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + shape = (3, 3) + X = dpt.reshape( + dpt.arange(prod(shape), dtype="int", sycl_queue=q), + shape, + order=order, + ) + Y = dpt.tril(X, k=k) + Xnp = np.arange(prod(shape), dtype="int").reshape(shape, order=order) + Ynp = np.tril(Xnp, k=k) + assert Y.dtype == Ynp.dtype + assert X.flags == Y.flags + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_meshgrid(): + q = get_queue_or_skip() + + X = dpt.arange(5, sycl_queue=q) + Y = dpt.arange(3, sycl_queue=q) + Z = dpt.meshgrid(X, Y) + Znp = np.meshgrid(dpt.asnumpy(X), dpt.asnumpy(Y)) + n = len(Z) + assert n == len(Znp) + for i in range(n): + assert 
np.array_equal(dpt.asnumpy(Z[i]), Znp[i]) + assert dpt.meshgrid() == [] + # dimension > 1 must raise ValueError + with pytest.raises(ValueError): + dpt.meshgrid(dpt.usm_ndarray((4, 4))) + # unknown indexing kwarg must raise ValueError + with pytest.raises(ValueError): + dpt.meshgrid(X, indexing="ji") + # input arrays with different data types must raise ValueError + with pytest.raises(ValueError): + dpt.meshgrid(X, dpt.asarray(Y, dtype="b1")) + + +def test_meshgrid2(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + q3 = get_queue_or_skip() + + x1 = dpt.arange(0, 2, dtype="int16", sycl_queue=q1) + x2 = dpt.arange(3, 6, dtype="int16", sycl_queue=q2) + x3 = dpt.arange(6, 10, dtype="int16", sycl_queue=q3) + y1, y2, y3 = dpt.meshgrid(x1, x2, x3, indexing="xy") + z1, z2, z3 = dpt.meshgrid(x1, x2, x3, indexing="ij") + assert all( + x.sycl_queue == y.sycl_queue for x, y in zip((x1, x2, x3), (y1, y2, y3)) + ) + assert all( + x.sycl_queue == z.sycl_queue for x, z in zip((x1, x2, x3), (z1, z2, z3)) + ) + assert y1.shape == y2.shape and y2.shape == y3.shape + assert z1.shape == z2.shape and z2.shape == z3.shape + assert y1.shape == (len(x2), len(x1), len(x3)) + assert z1.shape == (len(x1), len(x2), len(x3)) + + +def test_common_arg_validation(): + order = "I" + # invalid order must raise ValueError + with pytest.raises(ValueError): + dpt.empty(10, order=order) + with pytest.raises(ValueError): + dpt.zeros(10, order=order) + with pytest.raises(ValueError): + dpt.ones(10, order=order) + with pytest.raises(ValueError): + dpt.full(10, 1, order=order) + with pytest.raises(ValueError): + dpt.eye(10, order=order) + try: + X = dpt.empty(10) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.empty_like(X, order=order) + with pytest.raises(ValueError): + dpt.zeros_like(X, order=order) + with pytest.raises(ValueError): + dpt.ones_like(X, order=order) + with pytest.raises(ValueError): + dpt.full_like(X, 1, order=order) + X = {} + # test for type validation + with pytest.raises(TypeError): + dpt.empty_like(X) + with pytest.raises(TypeError): + dpt.zeros_like(X) + with pytest.raises(TypeError): + dpt.ones_like(X) + with pytest.raises(TypeError): + dpt.full_like(X, 1) + with pytest.raises(TypeError): + dpt.tril(X) + with pytest.raises(TypeError): + dpt.triu(X) + with pytest.raises(TypeError): + dpt.meshgrid(X) + + +def test_flags(): + try: + x = dpt.empty(tuple(), dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + f = x.flags + # check comparison with generic types + assert f != Ellipsis + f.__repr__() + assert f.c_contiguous == f["C"] + assert f.f_contiguous == f["F"] + assert f.contiguous == f["CONTIGUOUS"] + assert f.fc == f["FC"] + assert f.forc == f["FORC"] + assert f.fnc == f["FNC"] + assert f.writable == f["W"] + + +def test_asarray_uint64(): + Xnp = np.ndarray(1, dtype=np.uint64) + try: + X = dpt.asarray(Xnp) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert X.dtype == Xnp.dtype + + +def test_Device(): + try: + dev = dpctl.select_default_device() + d1 = dpt.Device.create_device(dev) + d2 = dpt.Device.create_device(dev) + except (dpctl.SyclQueueCreationError, dpctl.SyclDeviceCreationError): + pytest.skip( + "Could not create default device, or a queue that targets it" + ) + assert d1 == d2 + dict = {d1: 1} + assert dict[d2] == 1 + assert d1 == d2.sycl_queue + assert not d1 == Ellipsis + + +def test_element_offset(): + n0, n1 = 3, 8 + try: + x 
= dpt.empty((n0, n1), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert isinstance(x._element_offset, int)
+    assert x._element_offset == 0
+    y = x[::-1, ::2]
+    assert y._element_offset == (n0 - 1) * n1
+
+
+def test_byte_bounds():
+    n0, n1 = 3, 8
+    try:
+        x = dpt.empty((n0, n1), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert isinstance(x._byte_bounds, tuple)
+    assert len(x._byte_bounds) == 2
+    lo, hi = x._byte_bounds
+    assert hi - lo == n0 * n1 * x.itemsize
+    y = x[::-1, ::2]
+    lo, hi = y._byte_bounds
+    assert hi - lo == (n0 * n1 - 1) * x.itemsize
+
+
+def test_gh_1201():
+    n = 100
+    a = np.flipud(np.arange(n, dtype="i4"))
+    try:
+        b = dpt.asarray(a)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert (dpt.asnumpy(b) == a).all()
+    c = dpt.flip(dpt.empty(a.shape, dtype=a.dtype))
+    c[:] = a
+    assert (dpt.asnumpy(c) == a).all()
+
+
+class ObjWithSyclUsmArrayInterface:
+    def __init__(self, ary):
+        self._array_obj = ary
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        _suai = self._array_obj.__sycl_usm_array_interface__
+        return _suai
+
+
+@pytest.mark.parametrize("ro_flag", [True, False])
+def test_asarray_writable_flag(ro_flag):
+    try:
+        a = dpt.empty(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+
+    a.flags["W"] = not ro_flag
+    wrapped = ObjWithSyclUsmArrayInterface(a)
+
+    b = dpt.asarray(wrapped)
+
+    assert b.flags["W"] == (not ro_flag)
+    assert b._pointer == a._pointer
+
+
+def test_getitem_validation():
+    """Test based on gh-1785"""
+    try:
+        a = dpt.empty((2, 2, 2))
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(IndexError):
+        a[0.0]
+    with pytest.raises(IndexError):
+        a[1, 0.0, ...]
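+    # a float index must be rejected in any position, including when
+    # combined with dpt.newaxis and Ellipsis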
+    with pytest.raises(IndexError):
+        a[1, 0.0, dpt.newaxis, 1]
+    with pytest.raises(IndexError):
+        a[dpt.newaxis, ..., 0.0]
+    with pytest.raises(IndexError):
+        a[dpt.newaxis, ..., 0.0, dpt.newaxis]
+    with pytest.raises(IndexError):
+        a[..., 0.0, dpt.newaxis]
+    with pytest.raises(IndexError):
+        a[:, 0.0, dpt.newaxis]
+
+
+def test_array_like_ctors_order_K():
+    get_queue_or_skip()
+
+    sh = (10, 10)
+    x1 = dpt.zeros(sh, dtype="i4", order="C")
+    r1 = dpt.full_like(x1, 2, order="K")
+    assert dpt.all(r1 == 2)
+    assert r1.flags.c_contiguous
+    r2 = dpt.empty_like(x1, order="K")
+    assert r2.flags.c_contiguous
+    r3 = dpt.ones_like(x1, order="K")
+    assert dpt.all(r3 == 1)
+    assert r3.flags.c_contiguous
+    r4 = dpt.zeros_like(x1, order="K")
+    assert dpt.all(r4 == 0)
+    assert r4.flags.c_contiguous
+
+    x2 = dpt.zeros(sh, dtype="i4", order="F")
+    r5 = dpt.full_like(x2, 2, order="K")
+    assert dpt.all(r5 == 2)
+    assert r5.flags.f_contiguous
+    r6 = dpt.empty_like(x2, order="K")
+    assert r6.flags.f_contiguous
+    r7 = dpt.ones_like(x2, order="K")
+    assert dpt.all(r7 == 1)
+    assert r7.flags.f_contiguous
+    r8 = dpt.zeros_like(x2, order="K")
+    assert dpt.all(r8 == 0)
+    assert r8.flags.f_contiguous
+
+    x3 = dpt.zeros(sh, dtype="i4", order="C")[::-2, :5]
+    st_expected = (-5, 1)
+    r9 = dpt.full_like(x3, 2, order="K")
+    assert dpt.all(r9 == 2)
+    assert r9.strides == st_expected
+    assert not r9.flags.forc
+    r10 = dpt.empty_like(x3, order="K")
+    assert not r10.flags.forc
+    assert r10.strides == st_expected
+    r11 = dpt.ones_like(x3, order="K")
+    assert dpt.all(r11 == 1)
+    assert not r11.flags.forc
+    assert r11.strides == st_expected
+    r12 = dpt.zeros_like(x3, order="K")
+    assert dpt.all(r12 == 0)
+    assert not r12.flags.forc
+    assert r12.strides == st_expected
+
+
+def test_array_like_ctors_order_A():
+    get_queue_or_skip()
+
+    sh = (10, 10)
+    x1 = dpt.zeros(sh, dtype="i4", order="C")
+    r1 = dpt.full_like(x1, 2, order="A")
+    assert dpt.all(r1 == 2)
+    assert r1.flags.c_contiguous
+    r2 = dpt.empty_like(x1, order="A")
+    assert r2.flags.c_contiguous
+    r3 = dpt.ones_like(x1, order="A")
+    assert dpt.all(r3 == 1)
+    assert r3.flags.c_contiguous
+    r4 = dpt.zeros_like(x1, order="A")
+    assert dpt.all(r4 == 0)
+    assert r4.flags.c_contiguous
+
+    x2 = dpt.zeros(sh, dtype="i4", order="F")
+    r5 = dpt.full_like(x2, 2, order="A")
+    assert dpt.all(r5 == 2)
+    assert r5.flags.f_contiguous
+    r6 = dpt.empty_like(x2, order="A")
+    assert r6.flags.f_contiguous
+    r7 = dpt.ones_like(x2, order="A")
+    assert dpt.all(r7 == 1)
+    assert r7.flags.f_contiguous
+    r8 = dpt.zeros_like(x2, order="A")
+    assert dpt.all(r8 == 0)
+    assert r8.flags.f_contiguous
+
+    x3 = dpt.zeros(sh, dtype="i4", order="C")[::-2, :5]
+    r9 = dpt.full_like(x3, 2, order="A")
+    assert dpt.all(r9 == 2)
+    assert r9.flags.c_contiguous
+    r10 = dpt.empty_like(x3, order="A")
+    assert r10.flags.c_contiguous
+    r11 = dpt.ones_like(x3, order="A")
+    assert dpt.all(r11 == 1)
+    assert r11.flags.c_contiguous
+    r12 = dpt.zeros_like(x3, order="A")
+    assert dpt.all(r12 == 0)
+    assert r12.flags.c_contiguous
+
+
+def test_full_like_order_K_array_fill_v():
+    get_queue_or_skip()
+
+    x = dpt.zeros((10, 10), dtype="i4")
+    fill_v = dpt.asarray(2, dtype="i4")
+
+    r1 = dpt.full_like(x, fill_v, order="K")
+    assert dpt.all(r1 == 2)
+
+    # broadcast behavior
+    fill_v = dpt.arange(10, dtype="i4")[:, dpt.newaxis]
+    r1 = dpt.full_like(x, fill_v, order="K")
+    assert dpt.all(r1 == dpt.tile(fill_v, (1, 10)))
+
+
+def test_full_like_order_K_same_input_output_queues():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
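+    # the result of full_like is expected to be associated with the
+    # queue of the array argument x, not the queue of fill_v below
+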
x = dpt.zeros((10, 10), dtype="i4", sycl_queue=q1) + fill_v = dpt.asarray(2, dtype="i4", sycl_queue=q2) + + r = dpt.full_like(x, fill_v, order="K") + assert r.sycl_queue == x.sycl_queue + + +def test_asarray_from_numpy_contig(): + get_queue_or_skip() + + i_dt = np.int64 + Xnp = np.arange(32, dtype=i_dt) + + fp_dt = dpt.float32 + # Use contig copy kernel + Xdpt = dpt.asarray(Xnp, dtype=fp_dt) + + assert dpt.all(Xdpt == dpt.arange(32, dtype=fp_dt)) + + +def test_setitem_from_numpy_contig(): + get_queue_or_skip() + + i_dt = np.int64 + fp_dt = dpt.float32 + + Xnp = np.flip(np.arange(32, dtype=i_dt)) + Xdpt = dpt.flip(dpt.empty(Xnp.shape, dtype=fp_dt)) + # Use contig copy kernel, after stride simplification + Xdpt[:] = Xnp + + expected = dpt.arange(31, stop=-1, step=-1, dtype=fp_dt) + assert dpt.all(Xdpt == expected) + + Xnp = np.fliplr(np.reshape(np.arange(-10, 10, dtype=i_dt), (4, 5))) + Xdpt = dpt.flip(dpt.empty(Xnp.shape, dtype=fp_dt), axis=-1) + + # after stride simplification, contig kernel is used + Xdpt[:] = Xnp + + expected = dpt.reshape(dpt.arange(-10, 10, dtype=fp_dt), (4, 5)) + assert dpt.all(dpt.flip(Xdpt, axis=-1) == expected) + + +def test_full_functions_raise_type_error(): + get_queue_or_skip() + + with pytest.raises(TypeError): + dpt.full(1, "0") + + x = dpt.ones(1, dtype="i4") + with pytest.raises(TypeError): + dpt.full_like(x, "0") + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_setitem_copy_as_contig_alignment(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 8, 23 + + x = dpt.zeros((n0, n1), dtype=dtype_, sycl_queue=q) + + vals = dpt.ones(n1, dtype=dtype_, sycl_queue=q)[dpt.newaxis, :] + x[1:, ...] = vals + assert dpt.all(x[0] == 0) + assert dpt.all(x[1:, :] == vals) + + +def test_asarray_property(): + get_queue_or_skip() + + x = dpt.ones(11, dtype="i4") + + with pytest.raises(TypeError): + np.asarray(x) diff --git a/dpnp/tests/tensor/test_usm_ndarray_dlpack.py b/dpnp/tests/tensor/test_usm_ndarray_dlpack.py new file mode 100644 index 000000000000..7db73467f788 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_dlpack.py @@ -0,0 +1,919 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import collections
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+import dpnp.tensor._dlpack as _dlp
+import dpnp.tensor._usmarray as dpt_arr
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+device_CPU = dpt_arr.DLDeviceType.kDLCPU
+device_oneAPI = dpt_arr.DLDeviceType.kDLOneAPI
+
+_usm_types_list = ["shared", "device", "host"]
+
+
+@pytest.fixture(params=_usm_types_list)
+def usm_type(request):
+    return request.param
+
+
+_typestrs_list = [
+    "b1",
+    "u1",
+    "i1",
+    "u2",
+    "i2",
+    "u4",
+    "i4",
+    "u8",
+    "i8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.fixture(params=_typestrs_list)
+def typestr(request):
+    return request.param
+
+
+@pytest.fixture
+def all_root_devices():
+    """
+    Collects root devices. For the sake of speed
+    of test suite execution, keep at most two
+    devices from each platform
+    """
+    devs = dpctl.get_devices()
+    devs_per_platform = collections.defaultdict(list)
+    for dev in devs:
+        devs_per_platform[dev.sycl_platform].append(dev)
+
+    pruned = map(lambda li: li[:2], devs_per_platform.values())
+    return sum(pruned, start=[])
+
+
+def test_dlpack_device(usm_type, all_root_devices):
+    for sycl_dev in all_root_devices:
+        X = dpt.empty((64,), dtype="u1", usm_type=usm_type, device=sycl_dev)
+        dev = X.__dlpack_device__()
+        assert type(dev) is tuple
+        assert len(dev) == 2
+        assert dev[0] == device_oneAPI
+        assert dev[1] == sycl_dev.get_device_id()
+
+
+def test_dlpack_exporter(typestr, usm_type, all_root_devices):
+    caps_fn = ctypes.pythonapi.PyCapsule_IsValid
+    caps_fn.restype = bool
+    caps_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
+    for sycl_dev in all_root_devices:
+        skip_if_dtype_not_supported(typestr, sycl_dev)
+        X = dpt.empty((64,), dtype=typestr, usm_type=usm_type, device=sycl_dev)
+        caps = X.__dlpack__()
+        assert caps_fn(caps, b"dltensor")
+        Y = X[::2]
+        caps2 = Y.__dlpack__()
+        assert caps_fn(caps2, b"dltensor")
+
+
+def test_dlpack_exporter_empty(typestr, usm_type):
+    caps_fn = ctypes.pythonapi.PyCapsule_IsValid
+    caps_fn.restype = bool
+    caps_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
+    try:
+        sycl_dev = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    skip_if_dtype_not_supported(typestr, sycl_dev)
+    X = dpt.empty((0,), dtype=typestr, usm_type=usm_type, device=sycl_dev)
+    caps = X.__dlpack__()
+    assert caps_fn(caps, b"dltensor")
+    Y = dpt.empty(
+        (
+            1,
+            0,
+        ),
+        dtype=typestr,
+        usm_type=usm_type,
+        device=sycl_dev,
+    )
+    caps = Y.__dlpack__()
+    assert caps_fn(caps, b"dltensor")
+
+
+def test_dlpack_exporter_stream():
+    try:
+        q1 = dpctl.SyclQueue()
+        q2 = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Could not create default queues")
+    X = dpt.empty((64,), dtype="u1", sycl_queue=q1)
+    cap1 = X.__dlpack__(stream=q1)
+    cap2 = X.__dlpack__(stream=q2)
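+    # exporting must produce a capsule of the same type no matter which
+    # queue is passed as `stream` for synchronization
+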
assert type(cap1) is type(cap2) + + +@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)]) +def test_from_dlpack(shape, typestr, usm_type, all_root_devices): + for sycl_dev in all_root_devices: + skip_if_dtype_not_supported(typestr, sycl_dev) + X = dpt.empty(shape, dtype=typestr, usm_type=usm_type, device=sycl_dev) + Y = dpt.from_dlpack(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X._pointer == Y._pointer + # we can only expect device to round-trip for USM-device and + # USM-shared allocations, which are made for specific device + assert (Y.usm_type == "host") or (X.sycl_device == Y.sycl_device) + if Y.ndim: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +@pytest.mark.parametrize("mod", [2, 5]) +def test_from_dlpack_strides(mod, typestr, usm_type, all_root_devices): + for sycl_dev in all_root_devices: + skip_if_dtype_not_supported(typestr, sycl_dev) + X0 = dpt.empty( + 3 * mod, dtype=typestr, usm_type=usm_type, device=sycl_dev + ) + for start in range(mod): + X = X0[slice(-start - 1, None, -mod)] + Y = dpt.from_dlpack(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X._pointer == Y._pointer + # we can only expect device to round-trip for USM-device and + # USM-shared allocations, which are made for specific device + assert (Y.usm_type == "host") or (X.sycl_device == Y.sycl_device) + if Y.ndim: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +def test_from_dlpack_input_validation(): + v = dpt._dlpack.get_build_dlpack_version() + assert type(v) is tuple + with pytest.raises(TypeError): + dpt.from_dlpack(None) + + class DummyWithProperty: + @property + def __dlpack__(self): + return None + + with pytest.raises(TypeError): + dpt.from_dlpack(DummyWithProperty()) + + class DummyWithMethod: + def __dlpack__(self): + return None + + with pytest.raises(TypeError): + dpt.from_dlpack(DummyWithMethod()) + + +def test_from_dlpack_fortran_contig_array_roundtripping(): + """Based on examples from issue gh-1241""" + n0, n1 = 3, 5 + try: + ar1d = dpt.arange(n0 * n1, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + ar2d_c = dpt.reshape(ar1d, (n0, n1), order="C") + ar2d_f = dpt.asarray(ar2d_c, order="F") + ar2d_r = dpt.from_dlpack(ar2d_f) + + assert dpt.all(dpt.equal(ar2d_f, ar2d_r)) + assert dpt.all(dpt.equal(ar2d_c, ar2d_r)) + + +def test_dlpack_from_subdevice(): + """ + This test checks that array allocated on a sub-device, + with memory bound to platform-default SyclContext can be + exported and imported via DLPack. 
+ """ + n = 64 + try: + dev = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + try: + sdevs = dev.create_sub_devices(partition="next_partitionable") + except dpctl.SyclSubDeviceCreationError: + sdevs = None + try: + if sdevs is None: + sdevs = dev.create_sub_devices(partition=[1, 1]) + except dpctl.SyclSubDeviceCreationError: + pytest.skip("Default device can not be partitioned") + assert isinstance(sdevs, list) and len(sdevs) > 0 + try: + ctx = sdevs[0].sycl_platform.default_context + except dpctl.SyclContextCreationError: + pytest.skip("Platform's default_context is not available") + try: + q = dpctl.SyclQueue(ctx, sdevs[0]) + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + + ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q) + ar2 = dpt.from_dlpack(ar) + assert ar2.sycl_device == sdevs[0] + + +def test_legacy_dlpack_capsule(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + legacy_ver = (0, 8) + + cap = x.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x._pointer == y._pointer + + x = dpt.arange(100, dtype="u4") + x2 = dpt.reshape(x, (10, 10)).mT + cap = x2.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x2._pointer == y._pointer + del x2 + + x = dpt.arange(100, dtype="f4") + x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F") + cap = x2.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x2._pointer == y._pointer + + x = dpt.arange(100, dtype="c8") + x3 = x[::-2] + cap = x3.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + assert x3._pointer == y._pointer + del x3, y, x + del cap + + x = dpt.ones(100, dtype="?") + x4 = x[::-2] + cap = x4.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + assert x4._pointer == y._pointer + del x4, y, x + del cap + + +def test_versioned_dlpack_capsule(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x._pointer == y._pointer + + x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F") + cap = x2.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x2._pointer == y._pointer + del x2 + + x3 = x[::-2] + cap = x3.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + assert x3._pointer == y._pointer + del x3, y, x + del cap + + # read-only array + x = dpt.arange(100, dtype="i4") + x.flags["W"] = False + cap = x.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + assert x._pointer == y._pointer + assert not y.flags.writable + + # read-only array, and copy + cap = x.__dlpack__(max_version=max_supported_ver, copy=True) + y = _dlp.from_dlpack_capsule(cap) + assert x._pointer != y._pointer + assert not y.flags.writable + + +def test_from_dlpack_kwargs(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + y = dpt.from_dlpack(x, copy=True) + assert x._pointer != y._pointer + + z = dpt.from_dlpack(x, device=x.sycl_device) + assert z._pointer == x._pointer + + +def test_dlpack_deleters(): + try: + x = dpt.arange(100, 
dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + legacy_ver = (0, 8) + cap = x.__dlpack__(max_version=legacy_ver) + del cap + + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver) + del cap + + +def test_from_dlpack_device(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + out = dpt.from_dlpack(x, device=x.__dlpack_device__()) + assert x.device == out.device + assert x._pointer == out._pointer + + out = dpt.from_dlpack(x, device=x.device) + assert x.device == out.device + assert x._pointer == out._pointer + + out = dpt.from_dlpack(x, device=x.sycl_device) + assert x.device == out.device + assert x._pointer == out._pointer + + +def test_used_dlpack_capsule(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + legacy_ver = (0, 8) + cap = x.__dlpack__(max_version=legacy_ver) + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver) + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + +def test_dlpack_size_0(): + try: + x = dpt.ones(0, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + legacy_ver = (0, 8) + cap = x.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer == x._pointer + + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer == x._pointer + + +def test_dlpack_max_version_validation(): + try: + x = dpt.ones(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + with pytest.raises( + TypeError, + match=r"`__dlpack__` expects `max_version` to be a " + r"2-tuple of integers `\(major, minor\)`, instead " + r"got .*", + ): + x.__dlpack__(max_version=1) + + +def test_dlpack_kwargs(): + try: + q1 = dpctl.SyclQueue() + q2 = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Could not create default queues") + x = dpt.arange(100, dtype="i4", sycl_queue=q1) + + legacy_ver = (0, 8) + cap = x.__dlpack__(stream=q2, max_version=legacy_ver, copy=True) + # `copy` ignored for legacy path + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer == x._pointer + del x, y + del cap + + x1 = dpt.arange(100, dtype="i4", sycl_queue=q1) + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x1.__dlpack__(stream=q2, max_version=max_supported_ver, copy=False) + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer == x1._pointer + del x1, y + del cap + + x2 = dpt.arange(100, dtype="i4", sycl_queue=q1) + cap = x2.__dlpack__(stream=q2, max_version=max_supported_ver, copy=True) + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer != x2._pointer + del x2, y + del cap + + +def _is_capsule(o): + t = type(o) + return t.__module__ == "builtins" and t.__name__ == "PyCapsule" + + +def test_dlpack_dl_device(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices 
available") + max_supported_ver = _dlp.get_build_dlpack_version() + cap1 = x.__dlpack__( + dl_device=x.__dlpack_device__(), max_version=max_supported_ver + ) + assert _is_capsule(cap1) + cap2 = x.__dlpack__(dl_device=(1, 0), max_version=max_supported_ver) + assert _is_capsule(cap2) + cap3 = x.__dlpack__( + dl_device=(device_CPU, 0), + max_version=max_supported_ver, + ) + assert _is_capsule(cap3) + cap4 = x.__dlpack__(dl_device=("kDLCPU", 0), max_version=max_supported_ver) + assert _is_capsule(cap4) + with pytest.raises(TypeError): + # pass method instead of return of its __call__ invocation + x.__dlpack__( + dl_device=x.__dlpack_device__, max_version=max_supported_ver + ) + with pytest.raises(TypeError): + # exercise check for length + x.__dlpack__(dl_device=(3,), max_version=max_supported_ver) + + +def test_from_dlpack_kdlcpu_interop_numpy(): + """ + Basic test that usm_ndarray can interoperate with NumPy ndarray + `__dlpack_device__`. + """ + get_queue_or_skip() + + sh = 5 + dt = dpt.int32 + + X = dpt.empty(sh, dtype=dt) + dl_device_np = np.empty(()).__dlpack_device__() + + Y = dpt.from_dlpack(X, device=dl_device_np) + assert isinstance(Y, np.ndarray) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + + V = dpt.from_dlpack(Y) + assert isinstance(V, np.ndarray) + assert Y.shape == V.shape + assert Y.dtype == V.dtype + + +@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)]) +def test_from_dlpack_to_kdlcpu(shape, typestr): + q = get_queue_or_skip() + skip_if_dtype_not_supported(typestr, q.sycl_device) + + X = dpt.empty(shape, dtype=typestr, sycl_queue=q) + Y = dpt.from_dlpack(X, device=(device_CPU, 0)) + assert isinstance(Y, np.ndarray) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + # NumPy does not treat size 0 arrays consistently + # w.r.t. strides, so skip these cases + if X.ndim and X.size != 0: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +@pytest.mark.parametrize("mod", [2, 5]) +def test_from_dlpack_to_kdlcpu_strides(mod, typestr): + q = get_queue_or_skip() + skip_if_dtype_not_supported(typestr, q.sycl_device) + + X0 = dpt.empty(3 * mod, dtype=typestr, sycl_queue=q) + for start in range(mod): + X = X0[slice(-start - 1, None, -mod)] + Y = dpt.from_dlpack(X, device=(device_CPU, 0)) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + if Y.ndim: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +def test_dlpack_from_subdevice_to_kdlcpu(): + """ + Check that array allocated on a sub-device can be + imported via DLPack to kDLCPU device (as a NumPy array). 
+ """ + n = 64 + try: + dev = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + try: + sdevs = dev.create_sub_devices(partition="next_partitionable") + except dpctl.SyclSubDeviceCreationError: + sdevs = None + try: + if sdevs is None: + sdevs = dev.create_sub_devices(partition=[1, 1]) + except dpctl.SyclSubDeviceCreationError: + pytest.skip("Default device can not be partitioned") + assert isinstance(sdevs, list) and len(sdevs) > 0 + try: + ctx = sdevs[0].sycl_platform.default_context + except dpctl.SyclContextCreationError: + pytest.skip("Platform's default_context is not available") + try: + q = dpctl.SyclQueue(ctx, sdevs[0]) + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + + ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q) + ar2 = dpt.from_dlpack(ar, dl_device=(device_CPU, 0)) + assert isinstance(ar2, np.ndarray) + + +def test_legacy_dlpack_capsule_from_numpy(): + """ + Check that NumPy's exported legacy DLPack capsule + will interoperate with from_dlpack_capsule, + especially with zero-copy. + """ + x = np.arange(100, dtype="i4") + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + + x = np.arange(100, dtype="u4").reshape((10, 10)).T + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + del x + + x = np.arange(100, dtype="f4").reshape((10, 10), order="F") + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + + x = np.arange(100, dtype="c8") + x1 = x[::-2] + cap = x1.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert x1.ctypes.data == y.ctypes.data + del x1, y, x + del cap + + x = np.ones(100, dtype="?") + x1 = x[::-2] + cap = x1.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert x1.ctypes.data == y.ctypes.data + del x1, y, x + del cap + + +def test_dlpack_capsule_readonly_array_to_kdlcpu(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + max_supported_ver = _dlp.get_build_dlpack_version() + # read-only array + x.flags["W"] = False + cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0)) + y = _dlp.from_dlpack_capsule(cap) + assert dpt.all(x == dpt.asarray(y)) + assert not y.flags["W"] + + cap1 = _dlp.numpy_to_dlpack_versioned_capsule(y, not y.flags["W"]) + y1 = _dlp.from_dlpack_capsule(cap1) + assert not y1.flags["W"] + + +def test_to_dlpack_capsule_c_and_f_contig(): + try: + x = dpt.asarray(np.random.rand(2, 3)) + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + cap = _dlp.to_dlpack_capsule(x) + y = _dlp.from_dlpack_capsule(cap) + assert np.allclose(dpt.asnumpy(x), dpt.asnumpy(y)) + assert x.strides == y.strides + + x_f = x.T + cap = _dlp.to_dlpack_capsule(x_f) + yf = _dlp.from_dlpack_capsule(cap) + assert np.allclose(dpt.asnumpy(x_f), dpt.asnumpy(yf)) + assert x_f.strides == yf.strides + del cap + + +def test_to_dlpack_versioned_capsule_c_and_f_contig(): + try: + x = dpt.asarray(np.random.rand(2, 3)) + max_supported_ver = _dlp.get_build_dlpack_version() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + cap = x.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + assert np.allclose(dpt.asnumpy(x), dpt.asnumpy(y)) + assert x.strides == y.strides + + x_f = x.T + cap = 
x_f.__dlpack__(max_version=max_supported_ver) + yf = _dlp.from_dlpack_capsule(cap) + assert np.allclose(dpt.asnumpy(x_f), dpt.asnumpy(yf)) + assert x_f.strides == yf.strides + del cap + + +def test_used_dlpack_capsule_from_numpy(): + get_queue_or_skip() + + x_np = np.arange(100, dtype="i4") + + cap = x_np.__dlpack__() + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + x = dpt.asarray(x_np) + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0)) + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + +def test_dlpack_size_0_on_kdlcpu(): + get_queue_or_skip() + x_np = np.ones(0, dtype="i4") + + cap = x_np.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert y.ctypes.data == x_np.ctypes.data + + +def test_copy_via_host(): + get_queue_or_skip() + x = dpt.ones(1, dtype="i4") + x_np = np.ones(1, dtype="i4") + x_dl_dev = x.__dlpack_device__() + y = dpt.from_dlpack(x_np, device=x_dl_dev) + assert isinstance(y, dpt.usm_ndarray) + assert y.sycl_device == x.sycl_device + assert y.usm_type == "device" + + with pytest.raises(ValueError): + # incorrect length of tuple + dpt.from_dlpack(x_np, device=(1, 0, 0)) + with pytest.raises(ValueError): + # only kDLCPU and kDLOneAPI are supported + dpt.from_dlpack(x, device=(2, 0)) + + num_devs = dpctl.get_num_devices() + if num_devs > 1: + j = [i for i in range(num_devs) if i != x_dl_dev[1]][0] + z = dpt.from_dlpack(x, device=(x_dl_dev[0], j)) + assert isinstance(z, dpt.usm_ndarray) + assert z.usm_type == "device" + + +def test_copy_via_host_gh_1789(): + "Test based on review example from gh-1789" + get_queue_or_skip() + x_np = np.ones((10, 10), dtype="i4") + # strides are no longer multiple of itemsize + x_np = np.lib.stride_tricks.as_strided( + x_np, shape=x_np.shape, strides=(x_np.strides[0] - 1, x_np.strides[1]) + ) + with pytest.raises(BufferError): + dpt.from_dlpack(x_np) + with pytest.raises(BufferError): + dpt.from_dlpack(x_np, device=(14, 0)) + + +class LegacyContainer: + "Helper class implementing legacy `__dlpack__` protocol" + + def __init__(self, array): + self._array = array + + def __dlpack__(self, stream=None): + return self._array.__dlpack__(stream=stream) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + +class Container: + "Helper class implementing `__dlpack__` protocol version 1.0" + + def __init__(self, array): + self._array = array + + def __dlpack__( + self, max_version=None, dl_device=None, copy=None, stream=None + ): + return self._array.__dlpack__( + max_version=max_version, + dl_device=dl_device, + copy=copy, + stream=stream, + ) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + +def test_generic_container_legacy(): + get_queue_or_skip() + C = LegacyContainer(dpt.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, dpt.usm_ndarray) + assert X._pointer == C._array._pointer + assert X.sycl_device == C._array.sycl_device + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + Z = dpt.from_dlpack(C, device=X.device) + assert isinstance(Z, dpt.usm_ndarray) + assert Z._pointer == X._pointer + 
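+    # same pointer: re-importing onto the producing device is zero-copy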
assert Z.device == X.device + + +def test_generic_container_legacy_np(): + get_queue_or_skip() + C = LegacyContainer(np.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, np.ndarray) + assert X.ctypes.data == C._array.ctypes.data + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + dev = dpt.Device.create_device() + Z = dpt.from_dlpack(C, device=dev) + assert isinstance(Z, dpt.usm_ndarray) + assert Z.device == dev + + +def test_generic_container(): + get_queue_or_skip() + C = Container(dpt.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, dpt.usm_ndarray) + assert X._pointer == C._array._pointer + assert X.sycl_device == C._array.sycl_device + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + Z = dpt.from_dlpack(C, device=X.device) + assert isinstance(Z, dpt.usm_ndarray) + assert Z._pointer == X._pointer + assert Z.device == X.device + + +def test_sycl_device_to_dldevice(all_root_devices): + for sycl_dev in all_root_devices: + dev = dpt.sycl_device_to_dldevice(sycl_dev) + assert type(dev) is tuple + assert len(dev) == 2 + assert dev[0] == device_oneAPI + assert dev[1] == sycl_dev.get_device_id() + + +def test_dldevice_to_sycl_device(all_root_devices): + for sycl_dev in all_root_devices: + dldev = dpt.empty(0, device=sycl_dev).__dlpack_device__() + dev = dpt.dldevice_to_sycl_device(dldev) + assert type(dev) is dpctl.SyclDevice + assert dev.get_device_id() == sycl_dev.get_device_id() + + +def test_dldevice_conversion_arg_validation(): + bad_dldevice_type = (dpt.DLDeviceType.kDLCPU, 0) + with pytest.raises(ValueError): + dpt.dldevice_to_sycl_device(bad_dldevice_type) + + bad_dldevice_len = bad_dldevice_type + (0,) + with pytest.raises(ValueError): + dpt.dldevice_to_sycl_device(bad_dldevice_len) + + bad_dldevice = {} + with pytest.raises(TypeError): + dpt.dldevice_to_sycl_device(bad_dldevice) + + bad_sycldevice = {} + with pytest.raises(TypeError): + dpt.sycl_device_to_dldevice(bad_sycldevice) diff --git a/dpnp/tests/tensor/test_usm_ndarray_indexing.py b/dpnp/tests/tensor/test_usm_ndarray_indexing.py new file mode 100644 index 000000000000..b81e5456872b --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_indexing.py @@ -0,0 +1,2054 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti +from dpnp.tensor._copy_utils import _take_multi_index + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_all_int_dtypes = ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] + + +def test_basic_slice1(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="u2", sycl_queue=q) + y = x[0] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == 0 + assert y.shape == () + assert y.strides == () + + +def test_basic_slice2(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="i2", sycl_queue=q) + y = x[(0,)] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == 0 + assert y.shape == () + assert y.strides == () + + +def test_basic_slice3(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="i2", sycl_queue=q) + y = x[:] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == x.ndim + assert y.shape == x.shape + assert y.strides == x.strides + y = x[(slice(None, None, None),)] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == x.ndim + assert y.shape == x.shape + assert y.strides == x.strides + + +def test_basic_slice4(): + q = get_queue_or_skip() + n0, n1 = 5, 3 + x = dpt.empty((n0, n1), dtype="f4", sycl_queue=q) + y = x[::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == (-x.strides[0], x.strides[1]) + actual_offset = y.__sycl_usm_array_interface__["offset"] + assert actual_offset == (n0 - 1) * n1 + + +def test_basic_slice5(): + q = get_queue_or_skip() + n0, n1 = 5, 3 + x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q) + y = x[:, ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == (x.strides[0], -x.strides[1]) + actual_offset = y.__sycl_usm_array_interface__["offset"] + assert actual_offset == (n1 - 1) + + +def test_basic_slice6(): + q = get_queue_or_skip() + i0, n0, n1 = 2, 4, 3 + x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q) + y = x[i0, ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (x.shape[1],) + assert y.strides == (-x.strides[1],) + actual_offset = y.__sycl_usm_array_interface__["offset"] + expected_offset = i0 * x.strides[0] + (n1 - 1) * x.strides[1] + assert actual_offset == expected_offset + + +def test_basic_slice7(): + q = get_queue_or_skip() + n0, n1, n2 = 5, 3, 2 + x = dpt.empty((n0, n1, n2), dtype="?", sycl_queue=q) + y = x[..., ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert 
y.shape == x.shape + assert y.strides == ( + x.strides[0], + x.strides[1], + -x.strides[2], + ) + actual_offset = y.__sycl_usm_array_interface__["offset"] + expected_offset = (n2 - 1) * x.strides[2] + assert actual_offset == expected_offset + + +def test_basic_slice8(): + q = get_queue_or_skip() + n0, n1 = 3, 7 + x = dpt.empty((n0, n1), dtype="u1", sycl_queue=q) + y = x[..., dpt.newaxis] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (n0, n1, 1) + assert y.strides == (n1, 1, 0) + + +def test_basic_slice9(): + q = get_queue_or_skip() + n0, n1 = 3, 7 + x = dpt.empty((n0, n1), dtype="u8", sycl_queue=q) + y = x[dpt.newaxis, ...] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1, n0, n1) + assert y.strides == (0, n1, 1) + + +def test_basic_slice10(): + q = get_queue_or_skip() + n0, n1, n2 = 3, 7, 5 + x = dpt.empty((n0, n1, n2), dtype="u1", sycl_queue=q) + y = x[dpt.newaxis, ..., :] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1, n0, n1, n2) + assert y.strides == (0, n1 * n2, n2, 1) + + +def _all_equal(it1, it2): + return all(bool(x == y) for x, y in zip(it1, it2)) + + +def test_advanced_slice1(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + y = x[(ii,)] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + +def test_advanced_slice1_negative_strides(): + q = get_queue_or_skip() + ii = dpt.asarray([0, 1], sycl_queue=q) + x = dpt.flip(dpt.arange(5, dtype="i4", sycl_queue=q)) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + +def test_advanced_slice2(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii, dpt.newaxis] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + (1,) + assert y.flags["C"] + + +def test_advanced_slice3(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[dpt.newaxis, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1,) + ii.shape + assert y.flags["C"] + + +def _make_3d(dt, q): + return dpt.reshape( + dpt.arange(3 * 3 * 3, dtype=dt, sycl_queue=q), + ( + 3, + 3, + 3, + ), + ) + + +def test_advanced_slice4(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + y = x[ii, ii, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert _all_equal( + (x[ii[k], ii[k], ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + +def test_advanced_slice5(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + y = x[ii, 0, ii] + assert isinstance(y, dpt.usm_ndarray) + # 0 broadcast to [0, 0] per array API + assert y.shape == ii.shape + assert _all_equal( + (x[ii[i], 0, ii[i]] for i in range(ii.shape[0])), + (y[i] for i in range(ii.shape[0])), + ) + + +def test_advanced_slice6(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = 
_make_3d("i4", q) + y = x[:, ii, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ( + x.shape[0], + ii.shape[0], + ) + assert _all_equal( + ( + x[i, ii[k], ii[k]] + for i in range(x.shape[0]) + for k in range(ii.shape[0]) + ), + (y[i, k] for i in range(x.shape[0]) for k in range(ii.shape[0])), + ) + + +def test_advanced_slice7(): + q = get_queue_or_skip() + mask = dpt.asarray( + [ + [[True, True, False], [False, True, True], [True, False, True]], + [[True, False, False], [False, False, True], [False, True, False]], + [[True, True, True], [False, False, False], [False, False, True]], + ], + sycl_queue=q, + ) + x = _make_3d("i2", q) + y = x[mask] + expected = [0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (len(expected),) + assert all(dpt.asnumpy(y[k]) == expected[k] for k in range(len(expected))) + + +def test_advanced_slice8(): + q = get_queue_or_skip() + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]], + sycl_queue=q, + ) + x = _make_3d("u2", q) + y = x[mask] + expected = dpt.asarray( + [[0, 1, 2], [12, 13, 14], [21, 22, 23]], sycl_queue=q + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_advanced_slice9(): + q = get_queue_or_skip() + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]], + sycl_queue=q, + ) + x = _make_3d("u4", q) + y = x[:, mask] + expected = dpt.asarray([[0, 4, 7], [9, 13, 16], [18, 22, 25]], sycl_queue=q) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def lin_id(i, j, k): + """global_linear_id for (3,3,3) range traversed in C-contiguous order""" + return 9 * i + 3 * j + k + + +def test_advanced_slice10(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i0 = dpt.asarray([0, 1, 1], device=x.device) + i1 = dpt.asarray([1, 1, 2], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + y = x[i0, i1, i2] + res_expected = dpt.asarray( + [ + lin_id(0, 1, 2), + lin_id(1, 1, 0), + lin_id(1, 2, 1), + ], + sycl_queue=q, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == res_expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all() + + +def test_advanced_slice11(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i0 = dpt.asarray([0, 1, 1], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + with pytest.raises(IndexError): + x[i0, :, i2] + + +def test_advanced_slice12(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i1 = dpt.asarray([1, 1, 2], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + y = x[:, dpt.newaxis, i1, i2, dpt.newaxis] + res_expected = dpt.asarray( + [ + [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]], + [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]], + [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]], + ], + sycl_queue=q, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == res_expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all() + + +def test_advanced_slice13(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i1 = dpt.asarray([[1], [2]], device=x.device) + i2 = dpt.asarray([[0, 1]], device=x.device) + y = x[i1, i2, 0] + expected = dpt.asarray( + [ + [lin_id(1, 0, 0), lin_id(1, 1, 0)], + [lin_id(2, 0, 0), lin_id(2, 1, 0)], + ], + device=x.device, + ) + assert isinstance(y, 
dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_advanced_slice14(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5) + y = x[ii, 0, ii, 1, :] + assert isinstance(y, dpt.usm_ndarray) + # integers broadcast to ii.shape per array API + assert y.shape == ii.shape + x.shape[-1:] + assert _all_equal( + ( + x[ii[i], 0, ii[i], 1, k] + for i in range(ii.shape[0]) + for k in range(x.shape[-1]) + ), + (y[i, k] for i in range(ii.shape[0]) for k in range(x.shape[-1])), + ) + + +def test_advanced_slice15(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5) + # : cannot appear between two integral arrays + with pytest.raises(IndexError): + x[ii, 0, ii, :, ii] + + +def test_advanced_slice16(): + q = get_queue_or_skip() + ii = dpt.asarray(1, sycl_queue=q) + i0 = dpt.asarray(False, sycl_queue=q) + i1 = dpt.asarray(True, sycl_queue=q) + x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5) + y = x[ii, i0, ii, i1, :] + # TODO: add a shape check here when discrepancy with NumPy is investigated + assert isinstance(y, dpt.usm_ndarray) + + +def test_integer_indexing_numpy_array(): + q = get_queue_or_skip() + ii = np.asarray([1, 2]) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert dpt.all(x[1:3] == y) + + +def test_boolean_indexing_numpy_array(): + q = get_queue_or_skip() + ii = np.asarray( + [False, True, True, False, False, False, False, False, False, False] + ) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (2,) + assert dpt.all(x[1:3] == y) + + +def test_boolean_indexing_validation(): + get_queue_or_skip() + x = dpt.zeros(10, dtype="i4") + ii = dpt.ones((2, 5), dtype="?") + with pytest.raises(IndexError): + x[ii] + with pytest.raises(IndexError): + x[ii[0, :]] + + +def test_boolean_indexing_getitem_empty_mask(): + get_queue_or_skip() + x = dpt.ones((2, 3, 4), dtype="i4") + ii = dpt.ones((0,), dtype="?") + assert x[ii].size == 0 + ii1 = dpt.ones((0, 3), dtype="?") + assert x[ii1].size == 0 + ii2 = dpt.ones((0, 3, 4), dtype="?") + assert x[ii2].size == 0 + + +def test_boolean_indexing_setitem_empty_mask(): + get_queue_or_skip() + x = dpt.ones((2, 3, 4), dtype="i4") + ii = dpt.ones((0,), dtype="?") + x[ii] = 0 + assert dpt.all(x == 1) + ii1 = dpt.ones((0, 3), dtype="?") + x[ii1] = 0 + assert dpt.all(x == 1) + ii2 = dpt.ones((0, 3, 4), dtype="?") + x[ii2] = 0 + assert dpt.all(x == 1) + + +def test_integer_indexing_1d(): + get_queue_or_skip() + x = dpt.arange(10, dtype="i4") + ind_1d = dpt.asarray([7, 3, 1], dtype="u2") + ind_2d = dpt.asarray([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + + y1 = x[ind_1d] + assert y1.shape == ind_1d.shape + y2 = x[ind_2d] + assert y2.shape == ind_2d.shape + assert (dpt.asnumpy(y1) == np.array([7, 3, 1], dtype="i4")).all() + assert ( + dpt.asnumpy(y2) + == np.array([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + ).all() + + +def test_integer_indexing_2d(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + y = x[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.array([[5, 6], [12, 
13]])).all() + + +def test_integer_strided_indexing(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(2 * n0 * n1, dtype="i4"), + ( + 2 * n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + z = x[::-2, :] + y = z[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + zc = dpt.copy(z, order="C") + yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all() + + +def test_TrueFalse_indexing(): + get_queue_or_skip() + n0, n1 = 2, 3 + x = dpt.ones((n0, n1)) + for ind in [True, dpt.asarray(True)]: + y1 = x[ind] + assert y1.shape == (1, n0, n1) + assert y1._pointer == x._pointer + y2 = x[:, ind] + assert y2.shape == (n0, 1, n1) + assert y2._pointer == x._pointer + y3 = x[..., ind] + assert y3.shape == (n0, n1, 1) + assert y3._pointer == x._pointer + for ind in [False, dpt.asarray(False)]: + y1 = x[ind] + assert y1.shape == (0, n0, n1) + assert y1._pointer == x._pointer + y2 = x[:, ind] + assert y2.shape == (n0, 0, n1) + assert y2._pointer == x._pointer + y3 = x[..., ind] + assert y3.shape == (n0, n1, 0) + assert y3._pointer == x._pointer + + +def test_mixed_index_getitem(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(1000, dtype="i4"), (10, 10, 10)) + i1b = dpt.ones(10, dtype="?") + info = x.__array_namespace__().__array_namespace_info__() + ind_dt = info.default_dtypes(device=x.device)["indexing"] + i0 = dpt.asarray([0, 2, 3], dtype=ind_dt)[:, dpt.newaxis] + i2 = dpt.asarray([3, 4, 7], dtype=ind_dt)[:, dpt.newaxis] + y = x[i0, i1b, i2] + assert y.shape == (3, dpt.sum(i1b, dtype="i8")) + + +def test_mixed_index_setitem(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(1000, dtype="i4"), (10, 10, 10)) + i1b = dpt.ones(10, dtype="?") + info = x.__array_namespace__().__array_namespace_info__() + ind_dt = info.default_dtypes(device=x.device)["indexing"] + i0 = dpt.asarray([0, 2, 3], dtype=ind_dt)[:, dpt.newaxis] + i2 = dpt.asarray([3, 4, 7], dtype=ind_dt)[:, dpt.newaxis] + v_shape = (3, int(dpt.sum(i1b, dtype="i8"))) + canary = 7 + x[i0, i1b, i2] = dpt.full(v_shape, canary, dtype=x.dtype) + assert x[0, 0, 3] == canary + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_take_basic(data_dt, ind_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(10, dtype=data_dt) + ind = dpt.arange(2, 5, dtype=ind_dt) + y = dpt.take(x, ind) + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.arange(2, 5, dtype=data_dt)).all() + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_basic(data_dt, ind_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(10, dtype=data_dt) + ind = dpt.arange(2, 5, dtype=ind_dt) + val = dpt.ones(3, dtype=data_dt) + dpt.put(x, ind, val) + assert ( + dpt.asnumpy(x) + == np.array([0, 1, 1, 1, 1, 5, 6, 7, 8, 9], dtype=data_dt) + ).all() + + +def test_take_basic_axis(): + get_queue_or_skip() + + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind = dpt.arange(2, 4) + y0 = dpt.take(x, ind, axis=0) + y1 = dpt.take(x, ind, axis=1) + assert y0.shape == (2, n1) + assert y1.shape == (n0, 2) + + +def test_put_basic_axis(): + get_queue_or_skip() + + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind = dpt.arange(2, 4) + v0 = dpt.zeros((2, n1), 
dtype=x.dtype) + v1 = dpt.zeros((n0, 2), dtype=x.dtype) + dpt.put(x, ind, v0, axis=0) + dpt.put(x, ind, v1, axis=1) + expected = np.arange(n0 * n1, dtype="i4").reshape((n0, n1)) + expected[[2, 3], :] = 0 + expected[:, [2, 3]] = 0 + assert (expected == dpt.asnumpy(x)).all() + + +@pytest.mark.parametrize("data_dt", _all_dtypes) +def test_put_0d_val(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(5, dtype=data_dt, sycl_queue=q) + ind = dpt.asarray([0], dtype="i8", sycl_queue=q) + val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q) + x[ind] = val + assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0])) + + x = dpt.asarray(5, dtype=data_dt, sycl_queue=q) + dpt.put(x, ind, val) + assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x)) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_take_0d_data(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.asarray(0, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(5, dtype="i8", sycl_queue=q) + + y = dpt.take(x, ind) + assert ( + dpt.asnumpy(y) + == np.broadcast_to(np.asarray(0, dtype=data_dt), ind.shape) + ).all() + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_put_0d_data(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.asarray(0, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(5, dtype="i8", sycl_queue=q) + val = dpt.asarray(2, dtype=data_dt, sycl_queue=q) + + dpt.put(x, ind, val, axis=0) + assert ( + dpt.asnumpy(x) + == np.broadcast_to(np.asarray(2, dtype=data_dt), ind.shape) + ).all() + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_indexing_0d_ind(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(5, dtype="i4", sycl_queue=q) + ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) + + y = x[ind] + assert dpt.asnumpy(x[3]) == dpt.asnumpy(y) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_0d_ind(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(5, dtype="i4", sycl_queue=q) + ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) + val = dpt.asarray(5, dtype=x.dtype, sycl_queue=q) + + x[ind] = val + assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_take_strided_1d_source(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(27, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(4, 9, dtype="i8", sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + assert_array_equal( + np.take(x_np[s], ind_np, axis=0), + dpt.asnumpy(dpt.take(x[s], ind, axis=0)), + ) + + # 0-strided + x = dpt.usm_ndarray( + (27,), + dtype=data_dt, + strides=(0,), + buffer_ctor_kwargs={"queue": q}, + ) + x[0] = x_np[0] + assert_array_equal( + np.broadcast_to(x_np[0], ind.shape), + dpt.asnumpy(dpt.take(x, ind, axis=0)), + ) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_strided(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype="i8", sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in (-1, 1): + xs = x[s, ::sgn] + xs_np = x_np[s, ::sgn] + assert_array_equal( + 
np.take(xs_np, ind_np, axis=0), + dpt.asnumpy(dpt.take(xs, ind, axis=0)), + ) + assert_array_equal( + np.take(xs_np, ind_np, axis=1), + dpt.asnumpy(dpt.take(xs, ind, axis=1)), + ) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_take_strided_1d_indices(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype("i8") + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + assert_array_equal( + np.take(x_np, ind_np[s], axis=0), + dpt.asnumpy(dpt.take(x, ind[s], axis=0)), + ) + + # 0-strided + ind = dpt.usm_ndarray( + (12,), + dtype=ind_dt, + strides=(0,), + buffer_ctor_kwargs={"queue": q}, + ) + ind[0] = ind_np[0] + assert_array_equal( + np.broadcast_to(x_np[ind_np[0]], ind.shape), + dpt.asnumpy(dpt.take(x, ind, axis=0)), + ) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_strided_indices(ind_dt, order): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.reshape( + dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order + ) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype("i8") + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + inds = ind[s, ::sgn] + inds_np = ind_np[s, ::sgn] + assert_array_equal( + np.take(x_np, inds_np, axis=0), + dpt.asnumpy(x[inds]), + ) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_1d_destination(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(27, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(4, 9, dtype="i8", sycl_queue=q) + val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + x_np1 = x_np.copy() + x_np1[s][ind_np] = val_np + + x1 = dpt.copy(x) + dpt.put(x1[s], ind, val, axis=0) + + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_destination(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype="i8", sycl_queue=q) + val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + xs = x[s, ::sgn] + xs_np = x_np[s, ::sgn] + + x_np1 = xs_np.copy() + x_np1[ind_np] = val_np + + x1 = dpt.copy(xs) + dpt.put(x1, ind, val, axis=0) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + x_np1 = xs_np.copy() + x_np1[:, ind_np] = val_np + + x1 = dpt.copy(xs) + dpt.put(x1, ind, val, axis=1) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + x_np1 = xs_np.copy() + x_np1[ind_np, ind_np] = val_np + + x1 = dpt.copy(xs) + x1[ind, ind] = val + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_strided_1d_indices(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) + val = dpt.asarray(-1, 
dtype=x.dtype, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype("i8") + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + x_copy = dpt.copy(x) + dpt.put(x_copy, ind[s], val, axis=0) + + x_np_copy = x_np.copy() + x_np_copy[ind_np[s]] = val_np + + assert_array_equal(x_np_copy, dpt.asnumpy(x_copy)) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_indices(ind_dt, order): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.reshape( + dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order + ) + val = dpt.asarray(-1, sycl_queue=q, dtype=x.dtype) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype("i8") + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + inds = ind[s, ::sgn] + inds_np = ind_np[s, ::sgn] + + x_copy = dpt.copy(x) + x_copy[inds] = val + + x_np_copy = x_np.copy() + x_np_copy[inds_np] = val_np + + assert_array_equal(x_np_copy, dpt.asnumpy(x_copy)) + + +def test_integer_indexing_modes(): + q = get_queue_or_skip() + + x = dpt.arange(5, sycl_queue=q) + x_np = dpt.asnumpy(x) + + # wrapping negative indices + ind = dpt.asarray([-4, -3, 0, 2, 4], dtype="i8", sycl_queue=q) + + res = dpt.take(x, ind, mode="wrap") + expected_arr = np.take(x_np, dpt.asnumpy(ind), mode="raise") + + assert (dpt.asnumpy(res) == expected_arr).all() + + # clipping to 0 (disabling negative indices) + ind = dpt.asarray([-6, -3, 0, 2, 6], dtype="i8", sycl_queue=q) + + res = dpt.take(x, ind, mode="clip") + expected_arr = np.take(x_np, dpt.asnumpy(ind), mode="clip") + + assert (dpt.asnumpy(res) == expected_arr).all() + + +def test_take_arg_validation(): + q = get_queue_or_skip() + + x = dpt.arange(4, dtype="i4", sycl_queue=q) + ind0 = dpt.arange(4, dtype="i8", sycl_queue=q) + ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) + + with pytest.raises(TypeError): + dpt.take(dict(), ind0, axis=0) + with pytest.raises(TypeError): + dpt.take(x, dict(), axis=0) + with pytest.raises(IndexError): + x[[]] + with pytest.raises(IndexError): + dpt.take(x, ind1, axis=0) + with pytest.raises(IndexError): + x[ind1] + + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0) + with pytest.raises(ValueError): + dpt.take(x, ind0, mode=0) + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0, axis=None) + with pytest.raises(ValueError): + dpt.take(x, dpt.reshape(ind0, (2, 2))) + with pytest.raises(ValueError): + dpt.take(x[0], ind0, axis=2) + with pytest.raises(ValueError): + dpt.take(x[:, dpt.newaxis, dpt.newaxis], ind0, axis=None) + + +def test_put_arg_validation(): + q = get_queue_or_skip() + + x = dpt.arange(4, dtype="i4", sycl_queue=q) + ind0 = dpt.arange(4, dtype="i8", sycl_queue=q) + ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) + val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q) + + with pytest.raises(TypeError): + dpt.put(dict(), ind0, val, axis=0) + with pytest.raises(TypeError): + dpt.put(x, dict(), val, axis=0) + with pytest.raises(IndexError): + x[[]] = val + with pytest.raises(IndexError): + dpt.put(x, ind1, val, axis=0) + with pytest.raises(IndexError): + x[ind1] = val + with pytest.raises(TypeError): + dpt.put(x, ind0, {}, axis=0) + with pytest.raises(TypeError): + x[ind0] = {} + + with pytest.raises(ValueError): + dpt.put(x, ind0, val, mode=0) + with pytest.raises(ValueError): + dpt.put(x, dpt.reshape(ind0, (2, 
2)), val) + with pytest.raises(ValueError): + dpt.put(x[0], ind0, val, axis=2) + with pytest.raises(ValueError): + dpt.put(x[:, dpt.newaxis, dpt.newaxis], ind0, val, axis=None) + + +def test_advanced_indexing_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.arange(4, sycl_queue=q1) + ind0 = dpt.asarray([0], sycl_queue=q1) + ind1 = dpt.asarray([0], sycl_queue=q2) + val0 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q1) + val1 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.take(x, ind1, axis=0) + with pytest.raises(dpt.ExecutionPlacementError): + x[ind1] + with pytest.raises(dpt.ExecutionPlacementError): + dpt.put(x, ind1, val0, axis=0) + with pytest.raises(dpt.ExecutionPlacementError): + x[ind1] = val0 + with pytest.raises(dpt.ExecutionPlacementError): + dpt.put(x, ind0, val1, axis=0) + with pytest.raises(dpt.ExecutionPlacementError): + x[ind0] = val1 + + +def test_extract_all_1d(): + get_queue_or_skip() + x = dpt.arange(30, dtype="i4") + sel = dpt.ones(30, dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + # test strided case + x = dpt.arange(15, dtype="i4") + sel_np = np.zeros(15, dtype="?") + np.put(sel_np, np.random.choice(sel_np.size, size=7), True) + sel = dpt.asarray(sel_np) + + res = x[sel[::-1]] + expected_res = dpt.asnumpy(x)[sel_np[::-1]] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel[::-1], x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_all_2d(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(30, dtype="?") + sel[::2] = False + sel = dpt.reshape(sel, x.shape) + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_2D_axis0(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[0], dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + +def test_extract_2D_axis1(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[1], dtype="?") + sel[::2] = False + + res = x[:, sel] + expected = dpt.asnumpy(x)[:, dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected).all() + + +def test_extract_begin(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + expected = dpt.asnumpy(y)[[0, 1], [0, 1]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_end(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + z = y[..., sel] + expected = dpt.asnumpy(y)[..., [0], [0]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_middle(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + z = y[:, sel] + 
expected = dpt.asnumpy(y)[:, [0], [0], :] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_empty_result(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + z = y[:, sel] + assert z.shape == ( + y.shape[0], + 0, + y.shape[3], + ) + + +def test_place_all_1d(): + get_queue_or_skip() + x = dpt.arange(10, dtype="i2") + sel = dpt.zeros(10, dtype="?") + sel[0::2] = True + val = dpt.zeros(5, dtype=x.dtype) + x[sel] = val + assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() + dpt.place(x, sel, dpt.asarray([2])) + assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() + + +def test_place_2d_axis0(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True]) + val = dpt.zeros((2, 4), dtype=x.dtype) + x[sel] = val + expected_x = np.stack( + ( + np.zeros(4, dtype="i2"), + np.arange(4, 8, dtype="i2"), + np.zeros(4, dtype="i2"), + ) + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros((3, 2), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1_scalar(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros(tuple(), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_all_slices(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray( + [ + [False, True, True, False], + [True, True, False, False], + [False, False, True, True], + ], + dtype="?", + ) + y = dpt.ones_like(x) + y[sel] = x[sel] + + +def test_place_some_slices_begin(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + w = dpt.zeros_like(y) + w[sel] = z + + +def test_place_some_slices_mid(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, sel] + w = dpt.zeros_like(y) + w[:, sel] = z + + +def test_place_some_slices_end(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, :, sel] + w = dpt.zeros_like(y) + w[:, :, sel] = z + + +def test_place_cycling(): + get_queue_or_skip() + x = dpt.zeros(10, dtype="f4") + y = dpt.asarray([2, 3]) + sel = dpt.ones(x.size, dtype="?") + dpt.place(x, sel, y) + expected = np.array( + [ + 2, + 3, + ] + * 5, + dtype=x.dtype, + ) + assert (dpt.asnumpy(x) == expected).all() + + +def test_place_subset(): + get_queue_or_skip() + x = dpt.zeros(10, dtype="f4") + y = dpt.ones_like(x) + sel = dpt.ones(x.size, dtype="?") + sel[::2] = False + dpt.place(x, sel, y) + expected = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], 
dtype=x.dtype) + assert (dpt.asnumpy(x) == expected).all() + + +def test_place_empty_vals_error(): + get_queue_or_skip() + x = dpt.zeros(10, dtype="f4") + y = dpt.empty((0,), dtype=x.dtype) + sel = dpt.ones(x.size, dtype="?") + sel[::2] = False + with pytest.raises(ValueError): + dpt.place(x, sel, y) + + +def test_place_empty_vals_full_false_mask(): + get_queue_or_skip() + x = dpt.ones(10, dtype="f4") + y = dpt.empty((0,), dtype=x.dtype) + sel = dpt.zeros(x.size, dtype="?") + expected = np.ones(10, dtype=x.dtype) + dpt.place(x, sel, y) + assert (dpt.asnumpy(x) == expected).all() + + +def test_nonzero(): + get_queue_or_skip() + x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) + (i,) = dpt.nonzero(x) + assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all() + + +def test_nonzero_f_contig(): + "See gh-1370" + get_queue_or_skip() + + mask = dpt.zeros((5, 5), dtype="?", order="F") + mask[2, 3] = True + + expected_res = np.nonzero(dpt.asnumpy(mask)) + result = dpt.nonzero(mask) + + for exp, res in zip(expected_res, result): + assert_array_equal(dpt.asnumpy(res), exp) + assert dpt.asnumpy(mask[result]).all() + + +def test_nonzero_compacting(): + """See gh-1370. + Test with input where dimensionality + of iteration space is compacted from 3d to 2d + """ + get_queue_or_skip() + + mask = dpt.zeros((5, 5, 5), dtype="?", order="F") + mask[3, 2, 1] = True + mask_view = mask[..., :3] + + expected_res = np.nonzero(dpt.asnumpy(mask_view)) + result = dpt.nonzero(mask_view) + + for exp, res in zip(expected_res, result): + assert_array_equal(dpt.asnumpy(res), exp) + assert dpt.asnumpy(mask_view[result]).all() + + +def test_assign_scalar(): + get_queue_or_skip() + x = dpt.arange(-5, 5, dtype="i8") + cond = dpt.asarray( + [True, True, True, True, True, False, False, False, False, False] + ) + x[cond] = 0 # no error expected + x[dpt.nonzero(cond)] = -1 + expected = np.array([-1, -1, -1, -1, -1, 0, 1, 2, 3, 4], dtype=x.dtype) + assert (dpt.asnumpy(x) == expected).all() + + +def test_nonzero_large(): + get_queue_or_skip() + m = dpt.full((60, 80), True) + assert m[m].size == m.size + + m = dpt.full((30, 60, 80), True) + assert m[m].size == m.size + + +def test_extract_arg_validation(): + get_queue_or_skip() + with pytest.raises(TypeError): + dpt.extract(None, None) + cond = dpt.ones(10, dtype="?") + with pytest.raises(TypeError): + dpt.extract(cond, None) + q1 = dpctl.SyclQueue() + with pytest.raises(dpt.ExecutionPlacementError): + dpt.extract(cond.to_device(q1), dpt.zeros_like(cond, dtype="u1")) + with pytest.raises(ValueError): + dpt.extract(dpt.ones((2, 3), dtype="?"), dpt.ones((3, 2), dtype="i1")) + + +def test_place_arg_validation(): + get_queue_or_skip() + with pytest.raises(TypeError): + dpt.place(None, None, None) + arr = dpt.zeros(8, dtype="i1") + with pytest.raises(TypeError): + dpt.place(arr, None, None) + cond = dpt.ones(8, dtype="?") + with pytest.raises(TypeError): + dpt.place(arr, cond, None) + vals = dpt.ones_like(arr) + q1 = dpctl.SyclQueue() + with pytest.raises(dpt.ExecutionPlacementError): + dpt.place(arr.to_device(q1), cond, vals) + with pytest.raises(ValueError): + dpt.place(dpt.reshape(arr, (2, 2, 2)), cond, vals) + + +def test_nonzero_arg_validation(): + get_queue_or_skip() + with pytest.raises(TypeError): + dpt.nonzero(list()) + with pytest.raises(ValueError): + dpt.nonzero(dpt.asarray(1)) + + +def test_nonzero_dtype(): + "See gh-1322" + get_queue_or_skip() + x = dpt.ones((3, 4)) + idx, idy = dpt.nonzero(x) + # create array using device's + # default index data type + index_dt = 
dpt.dtype(ti.default_device_index_type(x.sycl_queue))
+    assert idx.dtype == index_dt
+    assert idy.dtype == index_dt
+
+
+def test_take_empty_axes():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+
+    with pytest.raises(IndexError):
+        dpt.take(x, inds, axis=1)
+
+    inds = dpt.ones(0, dtype="i4")
+    r = dpt.take(x, inds, axis=1)
+    assert r.shape == x.shape
+
+
+def test_put_empty_axes():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+    vals = dpt.zeros((3, 1, 4, 5, 6), dtype="f4")
+
+    with pytest.raises(IndexError):
+        dpt.put(x, inds, vals, axis=1)
+
+    inds = dpt.ones(0, dtype="i4")
+    vals = dpt.zeros_like(x)
+
+    with pytest.raises(ValueError):
+        dpt.put(x, inds, vals, axis=1)
+
+
+def test_put_cast_vals():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    inds = dpt.arange(7, 10, dtype="i4")
+    vals = dpt.zeros_like(inds, dtype="f4")
+
+    dpt.put(x, inds, vals)
+    assert dpt.all(x[7:10] == 0)
+
+
+def test_advanced_integer_indexing_cast_vals():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    inds = dpt.arange(7, 10, dtype="i4")
+    vals = dpt.zeros_like(inds, dtype="f4")
+
+    x[inds] = vals
+    assert dpt.all(x[7:10] == 0)
+
+
+def test_advanced_integer_indexing_empty_axis():
+    get_queue_or_skip()
+
+    # getting
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+    with pytest.raises(IndexError):
+        x[:, inds, ...]
+    with pytest.raises(IndexError):
+        x[inds, inds, inds, ...]
+
+    # setting
+    with pytest.raises(IndexError):
+        x[:, inds, ...] = 2
+    with pytest.raises(IndexError):
+        x[inds, inds, inds, ...] = 2
+
+    # empty inds
+    inds = dpt.ones(0, dtype="i4")
+    assert x[:, inds, ...].shape == x.shape
+    assert x[inds, inds, inds, ...].shape == (0, 5, 6)
+
+    vals = dpt.zeros_like(x)
+    x[:, inds, ...] = vals
+    vals = dpt.zeros((0, 5, 6), dtype="f4")
+    x[inds, inds, inds, ...] = vals
+
+
+def test_advanced_integer_indexing_cast_indices():
+    get_queue_or_skip()
+
+    for ind_dts in (("i1", "i2", "i4"), ("i1", "u4", "i4"), ("u1", "u2", "u8")):
+        x = dpt.ones((3, 4, 5, 6), dtype="i4")
+        inds0 = dpt.asarray([0, 1], dtype=ind_dts[0])
+        inds1 = dpt.astype(inds0, ind_dts[1])
+        x[inds0, inds1, ...] = 2
+        assert dpt.all(x[inds0, inds1, ...] == 2)
+        inds2 = dpt.astype(inds0, ind_dts[2])
+        x[inds0, inds1, inds2, ...] = 2
+        assert dpt.all(x[inds0, inds1, inds2, ...] == 2)
+
+    # fail when type promotion of the index arrays would require
+    # a floating-point type
+    inds0 = dpt.asarray([0, 1], dtype="i1")
+    inds1 = dpt.astype(inds0, "u4")
+    inds2 = dpt.astype(inds0, "u8")
+    x = dpt.ones((3, 4, 5, 6), dtype="i4")
+    # test getitem
+    with pytest.raises(ValueError):
+        x[inds0, inds1, inds2, ...]
+    # test setitem
+    with pytest.raises(ValueError):
+        x[inds0, inds1, inds2, ...] = 1
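+
+# A minimal, self-contained sketch of the index-array promotion rule the
+# cast-indices tests above rely on.  The helper below is illustrative only
+# (its name and error message are not part of the library); the assumption
+# is that mixed integer index arrays are cast to their common result_type,
+# and indexing is rejected when that promoted type is not an integer type
+# (e.g. "i1" together with "u8" promotes to "f8").
+def _promote_index_arrays_sketch(*inds):
+    res_dt = dpt.result_type(*inds)
+    if res_dt.kind not in "iu":
+        # promotion landed on a floating-point type: no safe integer cast
+        raise ValueError("index arrays must share an integer result_type")
+    return tuple(dpt.astype(ind, res_dt) for ind in inds)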
+
+
+def test_take_along_axis():
+    get_queue_or_skip()
+
+    n0, n1, n2 = 3, 5, 7
+    x = dpt.reshape(dpt.arange(n0 * n1 * n2), (n0, n1, n2))
+    ind_dt = dpt.__array_namespace_info__().default_dtypes(
+        device=x.sycl_device
+    )["indexing"]
+    ind0 = dpt.ones((1, n1, n2), dtype=ind_dt)
+    ind1 = dpt.ones((n0, 1, n2), dtype=ind_dt)
+    ind2 = dpt.ones((n0, n1, 1), dtype=ind_dt)
+
+    y0 = dpt.take_along_axis(x, ind0, axis=0)
+    assert y0.shape == ind0.shape
+    y1 = dpt.take_along_axis(x, ind1, axis=1)
+    assert y1.shape == ind1.shape
+    y2 = dpt.take_along_axis(x, ind2, axis=2)
+    assert y2.shape == ind2.shape
+
+
+def test_take_along_axis_validation():
+    # validate first argument
+    with pytest.raises(TypeError):
+        dpt.take_along_axis(tuple(), list())
+    get_queue_or_skip()
+    n1, n2 = 2, 5
+    x = dpt.ones(n1 * n2)
+    # validate second argument
+    with pytest.raises(TypeError):
+        dpt.take_along_axis(x, list())
+    x_dev = x.sycl_device
+    info_ = dpt.__array_namespace_info__()
+    def_dtypes = info_.default_dtypes(device=x_dev)
+    ind_dt = def_dtypes["indexing"]
+    ind = dpt.zeros(1, dtype=ind_dt)
+    # axis validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(x, ind, axis=1)
+    # mode validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(x, ind, axis=0, mode="invalid")
+    # same array-ranks validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(dpt.reshape(x, (n1, n2)), ind)
+    # check compute-follows-data
+    q2 = dpctl.SyclQueue(x_dev, property="enable_profiling")
+    ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2)
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.take_along_axis(x, ind2)
+
+
+def test_put_along_axis():
+    get_queue_or_skip()
+
+    n0, n1, n2 = 3, 5, 7
+    x = dpt.reshape(dpt.arange(n0 * n1 * n2), (n0, n1, n2))
+    ind_dt = dpt.__array_namespace_info__().default_dtypes(
+        device=x.sycl_device
+    )["indexing"]
+    ind0 = dpt.ones((1, n1, n2), dtype=ind_dt)
+    ind1 = dpt.ones((n0, 1, n2), dtype=ind_dt)
+    ind2 = dpt.ones((n0, n1, 1), dtype=ind_dt)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind0.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind0, vals, axis=0)
+    assert dpt.all(dpt.take_along_axis(xc, ind0, axis=0) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind1.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind1, vals, axis=1)
+    assert dpt.all(dpt.take_along_axis(xc, ind1, axis=1) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind2.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind2, vals, axis=2)
+    assert dpt.all(dpt.take_along_axis(xc, ind2, axis=2) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind2.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind2, dpt.asnumpy(vals), axis=2)
+    assert dpt.all(dpt.take_along_axis(xc, ind2, axis=2) == vals)
+
+
+def test_put_along_axis_validation():
+    # validate first argument
+    with pytest.raises(TypeError):
+        dpt.put_along_axis(tuple(), list(), list())
+    get_queue_or_skip()
+    n1, n2 = 2, 5
+    x = dpt.ones(n1 * n2)
+    # validate second argument
+    with pytest.raises(TypeError):
+        dpt.put_along_axis(x, list(), list())
+    x_dev = x.sycl_device
+    info_ = dpt.__array_namespace_info__()
+    def_dtypes = info_.default_dtypes(device=x_dev)
+    ind_dt = def_dtypes["indexing"]
+    ind = dpt.zeros(1, dtype=ind_dt)
+    vals = dpt.zeros(1, dtype=x.dtype)
+    # axis validation
+    with pytest.raises(ValueError):
+        dpt.put_along_axis(x, ind, vals, axis=1)
+    # mode validation
+    with pytest.raises(ValueError):
+        dpt.put_along_axis(x, ind, vals, axis=0, mode="invalid")
+    # same array-ranks validation
+    with pytest.raises(ValueError):
dpt.put_along_axis(dpt.reshape(x, (n1, n2)), ind, vals) + # check compute-follows-data + q2 = dpctl.SyclQueue(x_dev, property="enable_profiling") + ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.put_along_axis(x, ind2, vals) + + +def test_put_along_axis_application(): + get_queue_or_skip() + info_ = dpt.__array_namespace_info__() + def_dtypes = info_.default_dtypes(device=None) + ind_dt = def_dtypes["indexing"] + all_perms = dpt.asarray( + [ + [0, 1, 2, 3], + [0, 2, 1, 3], + [2, 0, 1, 3], + [2, 1, 0, 3], + [1, 0, 2, 3], + [1, 2, 0, 3], + [0, 1, 3, 2], + [0, 2, 3, 1], + [2, 0, 3, 1], + [2, 1, 3, 0], + [1, 0, 3, 2], + [1, 2, 3, 0], + [0, 3, 1, 2], + [0, 3, 2, 1], + [2, 3, 0, 1], + [2, 3, 1, 0], + [1, 3, 0, 2], + [1, 3, 2, 0], + [3, 0, 1, 2], + [3, 0, 2, 1], + [3, 2, 0, 1], + [3, 2, 1, 0], + [3, 1, 0, 2], + [3, 1, 2, 0], + ], + dtype=ind_dt, + ) + p_mats = dpt.zeros((24, 4, 4), dtype=dpt.int64) + vals = dpt.ones((24, 4, 1), dtype=p_mats.dtype) + # form 24 permutation matrices + dpt.put_along_axis(p_mats, all_perms[..., dpt.newaxis], vals, axis=2) + p2 = p_mats @ p_mats + p4 = p2 @ p2 + p8 = p4 @ p4 + expected = dpt.eye(4, dtype=p_mats.dtype)[dpt.newaxis, ...] + assert dpt.all(p8 @ p4 == expected) + + +def check__extract_impl_validation(fn): + x = dpt.ones(10) + ind = dpt.ones(10, dtype="?") + with pytest.raises(TypeError): + fn(list(), ind) + with pytest.raises(TypeError): + fn(x, list()) + q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling") + ind2 = dpt.ones(10, dtype="?", sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + fn(x, ind2) + with pytest.raises(ValueError): + fn(x, ind, 1) + + +def check__nonzero_impl_validation(fn): + with pytest.raises(TypeError): + fn(list()) + + +def check__take_multi_index(fn): + x = dpt.ones(10) + x_dev = x.sycl_device + info_ = dpt.__array_namespace_info__() + def_dtypes = info_.default_dtypes(device=x_dev) + ind_dt = def_dtypes["indexing"] + ind = dpt.arange(10, dtype=ind_dt) + with pytest.raises(TypeError): + fn(list(), tuple(), 1) + with pytest.raises(ValueError): + fn(x, (ind,), 0, mode=2) + with pytest.raises(ValueError): + fn(x, (None,), 1) + with pytest.raises(IndexError): + fn(x, (x,), 1) + q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling") + ind2 = dpt.arange(10, dtype=ind_dt, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + fn(x, (ind2,), 0) + m = dpt.ones((10, 10)) + ind_1 = dpt.arange(10, dtype="i8") + ind_2 = dpt.arange(10, dtype="u8") + with pytest.raises(ValueError): + fn(m, (ind_1, ind_2), 0) + + +def check__place_impl_validation(fn): + with pytest.raises(TypeError): + fn(list(), list(), list()) + x = dpt.ones(10) + with pytest.raises(TypeError): + fn(x, list(), list()) + q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling") + mask2 = dpt.ones(10, dtype="?", sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + fn(x, mask2, 1) + x2 = dpt.ones((5, 5)) + mask2 = dpt.ones((5, 5), dtype="?") + with pytest.raises(ValueError): + fn(x2, mask2, x2, axis=1) + + +def check__put_multi_index_validation(fn): + with pytest.raises(TypeError): + fn(list(), list(), 0, list()) + x = dpt.ones(10) + inds = dpt.arange(10, dtype="i8") + vals = dpt.zeros(10) + # test inds which is not a tuple/list + fn(x, inds, 0, vals) + x2 = dpt.ones((5, 5)) + ind1 = dpt.arange(5, dtype="i8") + ind2 = dpt.arange(5, dtype="u8") + with pytest.raises(ValueError): + fn(x2, (ind1, ind2), 0, x2) + with pytest.raises(TypeError): + # invalid 
index type + fn(x2, (ind1, list()), 0, x2) + with pytest.raises(ValueError): + # invalid mode keyword value + fn(x, inds, 0, vals, mode=100) + + +def test__copy_utils(): + import dpnp.tensor._copy_utils as cu + + get_queue_or_skip() + + check__extract_impl_validation(cu._extract_impl) + check__nonzero_impl_validation(cu._nonzero_impl) + check__take_multi_index(cu._take_multi_index) + check__place_impl_validation(cu._place_impl) + check__put_multi_index_validation(cu._put_multi_index) + + +@pytest.mark.parametrize("mode", ["wrap", "clip"]) +def test_take_indices_oob_py_ssize_t(mode): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + inds1 = dpt.full(5, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64) + inds2 = dpt.full(5, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64) + + # sweep through a small range of indices + # to check that OOB indices are well-behaved + for i in range(1, 10): + inds2 -= i + r1 = dpt.take(x, inds1, mode=mode) + r2 = dpt.take(x, inds2, mode=mode) + + assert dpt.all(r1 == r2) + + +@pytest.mark.parametrize("mode", ["wrap", "clip"]) +def test_put_indices_oob_py_ssize_t(mode): + get_queue_or_skip() + + x = dpt.full(10, -1, dtype="i4") + inds = dpt.full(1, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64) + + # OOB inds are positive, so always + # clip to the top of range + for i in range(1, 10): + inds -= i + dpt.put(x, inds, i, mode=mode) + + assert dpt.all(x[:-1] == -1) + assert x[-1] == i + + +def test_take_along_axis_uint64_indices(): + get_queue_or_skip() + + inds = dpt.arange(1, 10, 2, dtype="u8") + x = dpt.tile(dpt.asarray([0, -1], dtype="i4"), 5) + res = dpt.take_along_axis(x, inds) + assert dpt.all(res == -1) + + sh0 = 2 + inds = dpt.broadcast_to(inds, (sh0,) + inds.shape) + x = dpt.broadcast_to(x, (sh0,) + x.shape) + res = dpt.take_along_axis(x, inds, axis=1) + assert dpt.all(res == -1) + + +def test_put_along_axis_uint64_indices(): + get_queue_or_skip() + + inds = dpt.arange(1, 10, 2, dtype="u8") + x = dpt.zeros(10, dtype="i4") + dpt.put_along_axis(x, inds, dpt.asarray(2, dtype=x.dtype)) + expected = dpt.tile(dpt.asarray([0, 2], dtype="i4"), 5) + assert dpt.all(x == expected) + + sh0 = 2 + inds = dpt.broadcast_to(inds, (sh0,) + inds.shape) + x = dpt.zeros((sh0,) + x.shape, dtype="i4") + dpt.put_along_axis(x, inds, dpt.asarray(2, dtype=x.dtype), axis=1) + expected = dpt.tile(dpt.asarray([0, 2], dtype="i4"), (2, 5)) + assert dpt.all(expected == x) + + +@pytest.mark.parametrize("data_dt", _all_dtypes) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_out(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + axis = 0 + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype="i8", sycl_queue=q) + out_sh = x.shape[:axis] + ind.shape + x.shape[axis + 1 :] + out = dpt.empty(out_sh, dtype=data_dt, sycl_queue=q) + + expected = dpt.take(x, ind, axis=axis) + + dpt.take(x, ind, axis=axis, out=out) + + assert dpt.all(out == expected) + + +@pytest.mark.parametrize("data_dt", _all_dtypes) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_out_overlap(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + axis = 0 + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype="i8", sycl_queue=q) + out = x[x.shape[axis] - ind.shape[axis] : x.shape[axis], :] + + expected = dpt.take(x, ind, axis=axis) + + dpt.take(x, ind, axis=axis, out=out) + + assert dpt.all(out == expected) + assert dpt.all(x[x.shape[0] - ind.shape[0] : x.shape[0], :] 
== out) + + +def test_take_out_errors(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.arange(10, dtype="i4", sycl_queue=q1) + ind = dpt.arange(2, dtype="i4", sycl_queue=q1) + + with pytest.raises(TypeError): + dpt.take(x, ind, out=dict()) + + out_read_only = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q1) + out_read_only.flags["W"] = False + with pytest.raises(ValueError): + dpt.take(x, ind, out=out_read_only) + + out_bad_shape = dpt.empty(0, dtype=x.dtype, sycl_queue=q1) + with pytest.raises(ValueError): + dpt.take(x, ind, out=out_bad_shape) + + out_bad_dt = dpt.empty(ind.shape, dtype="i8", sycl_queue=q1) + with pytest.raises(ValueError): + dpt.take(x, ind, out=out_bad_dt) + + out_bad_q = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.take(x, ind, out=out_bad_q) + + +def test_getitem_impl_fn_invalid_inp(): + get_queue_or_skip() + + x = dpt.ones((10, 10), dtype="i4") + + bad_ind_type = (dpt.ones((), dtype="i4"), 2.0) + with pytest.raises(TypeError): + _take_multi_index(x, bad_ind_type, 0, 0) + + no_array_inds = (2, 3) + with pytest.raises(TypeError): + _take_multi_index(x, no_array_inds, 0, 0) diff --git a/dpnp/tests/tensor/test_usm_ndarray_linalg.py b/dpnp/tests/tensor/test_usm_ndarray_linalg.py new file mode 100644 index 000000000000..c28754ca080f --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_linalg.py @@ -0,0 +1,1030 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_numeric_types = [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +def _map_int_to_type(n, dt): + assert isinstance(n, int) + assert n > 0 + if dt == dpt.int8: + return ((n + 128) % 256) - 128 + elif dt == dpt.uint8: + return n % 256 + elif dt == dpt.int16: + return ((n + 32768) % 65536) - 32768 + elif dt == dpt.uint16: + return n % 65536 + return n + + +def test_matrix_transpose(): + get_queue_or_skip() + + X = dpt.reshape(dpt.arange(2 * 3, dtype="i4"), (2, 3)) + res = dpt.matrix_transpose(X) + expected_res = X.mT + + assert expected_res.shape == res.shape + assert expected_res.flags["C"] == res.flags["C"] + assert expected_res.flags["F"] == res.flags["F"] + assert dpt.all(X.mT == res) + + +def test_matrix_transpose_arg_validation(): + get_queue_or_skip() + + X = dpt.empty(5, dtype="i4") + with pytest.raises(ValueError): + dpt.matrix_transpose(X) + + X = {} + with pytest.raises(TypeError): + dpt.matrix_transpose(X) + + X = dpt.empty((5, 5), dtype="i4") + assert isinstance(dpt.matrix_transpose(X), dpt.usm_ndarray) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_matmul_simple(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, m = 235, 17 + m1 = dpt.zeros((m, n), dtype=dtype) + m2 = dpt.zeros((n, m), dtype=dtype) + + dt = m1.dtype + if dt.kind in "ui": + n1 = min(n, dpt.iinfo(dt).max) + else: + n1 = n + m1[:, :n1] = dpt.ones((m, n1), dtype=dt) + m2[:n1, :] = dpt.ones((n1, m), dtype=dt) + + for k in [1, 2, 3, 4, 7, 8, 9, 15, 16, 17]: + r = dpt.matmul(m1[:k, :], m2[:, :k]) + assert dpt.all(r == dpt.full((k, k), fill_value=n1, dtype=dt)) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_matmul_nilpotent1(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 77 + N_mat = dpt.eye(n, k=1, dtype=dtype) + I_mat = dpt.eye(n, dtype=dtype) + R_mat = dpt.eye(n, dtype=dtype) + for _ in range(n + 1): + R_mat = I_mat + dpt.matmul(N_mat, R_mat) + + assert dpt.allclose(dpt.matmul(I_mat - N_mat, R_mat), I_mat) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_matmul_nilpotent2(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 128 + u = dpt.ones((n, 1), dtype=dtype) + v = dpt.ones((1, n), dtype=dtype) + + uv = dpt.matmul(u, v) + uv_ref = u * v + + assert dpt.allclose(uv, uv_ref) + + +def test_matmul_null_axis(): + get_queue_or_skip() + n = 3 + + A_mat = dpt.ones((n, 0), dtype="f4") + B_mat = dpt.ones((0, 1), dtype="f4") + + R_mat = dpt.matmul(A_mat, B_mat) + assert R_mat.shape == (n, 1) + + R_mat = dpt.matmul(A_mat, B_mat[:, :0]) + assert R_mat.shape == (n, 0) + + +@pytest.mark.parametrize("dtype", ["i4", "f4"]) +def test_matmul_dims(dtype): + get_queue_or_skip() + + n, m, k, b = 4, 5, 7, 3 + v = dpt.ones(k, dtype=dtype) + m1 = dpt.ones((n, k), dtype=dtype) + m2 = dpt.ones((k, m), dtype=dtype) + st1 = dpt.ones((b, n, k), dtype=dtype) + st2 = dpt.ones((b, k, m), dtype=dtype) + + r = dpt.matmul(v, v) + assert r.shape == () + assert dpt.round(r) == k + + r = dpt.matmul(m1, v) + assert r.shape == (n,) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(v, m2) + assert r.shape == (m,) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(m1, m2) + assert 
r.shape == ( + n, + m, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(v, st2) + assert r.shape == ( + b, + m, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(st1, v) + assert r.shape == ( + b, + n, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(st1, m2) + assert r.shape == ( + b, + n, + m, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(m1, st2) + assert r.shape == ( + b, + n, + m, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(st1, st2) + assert r.shape == ( + b, + n, + m, + ) + assert dpt.all(dpt.round(r) == k) + + +def test_matmul_arg_validation(): + get_queue_or_skip() + + s1, s2 = dpt.ones(tuple()), dpt.zeros(tuple()) + v1, v2 = dpt.ones(16), dpt.zeros(16) + + with pytest.raises(ValueError): + dpt.matmul(s1, v2) + + with pytest.raises(ValueError): + dpt.matmul(v1, s2) + + with pytest.raises(TypeError): + dpt.matmul(dict(), v2) + + with pytest.raises(TypeError): + dpt.matmul(v2, None) + + +def test_matmul_dims_validation(): + get_queue_or_skip() + + m1 = dpt.ones((16, 16)) + m2 = dpt.ones((16, 16)) + + # contraction dimensions mismatch + with pytest.raises(ValueError): + dpt.matmul(m1[:, :7], m2[:3, :]) + + m1 = dpt.ones((3, 4, 5)) + m2 = dpt.ones((2, 5, 3)) + # broadcasting dimensions mismatch + with pytest.raises(ValueError): + dpt.matmul(m1, m2) + + +def test_matmul_broadcasting(): + get_queue_or_skip() + + for dt1, dt2 in [ + (dpt.int16, dpt.int32), + (dpt.float32, dpt.int16), + (dpt.int32, dpt.uint32), + ]: + m1 = dpt.ones((7, 11, 16), dtype=dt1) + m2 = dpt.ones((16, 13), dtype=dt2) + + r = dpt.matmul(m1, m2[dpt.newaxis, ...]) + + assert r.shape == (7, 11, 13) + + +@pytest.mark.parametrize("dtype", ["i4", "i8", "f4", "c8"]) +def test_matmul_strided(dtype): + get_queue_or_skip() + + m1_shape = (14, 22, 32) + m1_size = 1 + for el in m1_shape: + m1_size = m1_size * el + + m1 = dpt.remainder(dpt.arange(1, m1_size + 1, dtype="i8"), 13) + m1_orig = dpt.reshape(dpt.astype(m1, dtype), m1_shape) + m2_orig = dpt.ones((14, 16, 13), dtype=dtype) + + m1 = m1_orig[::2, ::-2, ::2] + m2 = m2_orig[::2, :, :] + r = dpt.matmul(m1, m2) + + assert r.shape == m1.shape[:2] + m2.shape[-1:] + ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2)) + assert np.allclose(dpt.asnumpy(r), ref) + + m1 = m1_orig[::2, ::2, ::-2] + m2 = m2_orig[::2, :, :] + r = dpt.matmul(m1, m2) + + assert r.shape == m1.shape[:2] + m2.shape[-1:] + ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2)) + assert np.allclose(dpt.asnumpy(r), ref) + + m1 = m1_orig[::-2, ::2, ::2] + m2 = m2_orig[::-2, :, :] + r = dpt.matmul(m1, m2) + + assert r.shape == m1.shape[:2] + m2.shape[-1:] + ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2)) + assert np.allclose(dpt.asnumpy(r), ref) + + +def test_matmul_out(): + get_queue_or_skip() + + m1 = ( + dpt.arange(14, dtype="f4")[:, dpt.newaxis, dpt.newaxis] + + dpt.arange(17, dtype="f4")[dpt.newaxis, :, dpt.newaxis] + + dpt.arange(128, dtype="f4")[dpt.newaxis, dpt.newaxis, :] + ) + assert m1.shape == (14, 17, 128) + m2 = dpt.tile( + dpt.reshape(dpt.asarray([1, 2], dtype="f4"), (2, 1, 1)), (7, 128, 13) + ) + assert m2.shape == (14, 128, 13) + + buf = dpt.zeros((2 * 14, 3 * 17, 13), dtype="f4") + res = dpt.matmul(m1, m2, out=buf[::-2, 1::3, :]) + + assert dpt.allclose(res, buf[::-2, 1::3, :]) + assert dpt.allclose(dpt.zeros_like(res), buf[::-2, 0::3, :]) + assert dpt.allclose(dpt.zeros_like(res), buf[::-2, 2::3, :]) + + m1_np = dpt.asnumpy(m1) + ref = np.matmul(m1_np, dpt.asnumpy(m2)) + assert np.allclose(ref, dpt.asnumpy(res)) + + res = dpt.matmul(m1[:, 
:10, :10], m1[:, :10, :10].mT, out=m1[:, :10, :10])
+    ref = np.matmul(
+        m1_np[:, :10, :10], np.transpose(m1_np[:, :10, :10], (0, 2, 1))
+    )
+    assert np.allclose(ref, dpt.asnumpy(res))
+
+
+def test_matmul_readonly_out():
+    get_queue_or_skip()
+    m = dpt.ones((10, 10), dtype=dpt.int32)
+    r = dpt.empty_like(m)
+    r.flags["W"] = False
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m, m, out=r)
+
+
+def test_matmul_dtype():
+    get_queue_or_skip()
+
+    for dt1, dt2 in [
+        (dpt.int32, dpt.int16),
+        (dpt.int16, dpt.int32),
+        (dpt.float32, dpt.int16),
+        (dpt.int32, dpt.float32),
+    ]:
+        m1 = dpt.ones((10, 10), dtype=dt1)
+        m2 = dpt.ones((10, 10), dtype=dt2)
+
+        for ord in ["C", "A", "F", "K"]:
+            r = dpt.matmul(m1, m2, dtype=dpt.float32, order=ord)
+            assert r.dtype == dpt.float32
+
+
+@pytest.mark.parametrize("dt1", _numeric_types)
+@pytest.mark.parametrize("dt2", _numeric_types)
+@pytest.mark.parametrize("order", ["C", "K"])
+def test_matmul_type_promotion(dt1, dt2, order):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    b, n, k, m = 8, 10, 17, 10
+    m1 = dpt.ones((1, n, k), dtype=dt1)
+    m2 = dpt.ones((b, k, m), dtype=dt2)
+    expected_dt = dpt.result_type(m1, m2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (b, n, m)
+    assert r.dtype == expected_dt
+
+    m1 = dpt.ones((b, n, k), dtype=dt1)
+    m2 = dpt.ones((1, k, m), dtype=dt2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (b, n, m)
+    assert r.dtype == expected_dt
+
+    m1 = dpt.ones((n, k), dtype=dt1)
+    m2 = dpt.ones((k, m), dtype=dt2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (n, m)
+    assert r.dtype == expected_dt
+
+
+def test_matmul_invalid_dtype():
+    get_queue_or_skip()
+
+    m1 = dpt.zeros((10, 10), dtype="f4")
+    m2 = dpt.zeros((10, 10), dtype="f4")
+    m3 = dpt.zeros((10, 10), dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m3, dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m3, m1, dtype="i4")
+
+
+def test_matmul_out_errors():
+    q1 = get_queue_or_skip()
+    q2 = dpctl.SyclQueue()
+
+    sh = (10, 10)
+    dt = "i4"
+    m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+    m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+
+    with pytest.raises(TypeError):
+        dpt.matmul(m1, m2, out=dict())
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, out=dpt.empty((10,), dtype=dt, sycl_queue=q1))
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, out=dpt.empty(sh, dtype="f4", sycl_queue=q1))
+
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.matmul(m1, m2, out=dpt.empty(sh, dtype=dt, sycl_queue=q2))
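+
+# The order handling checked in the next test can be summarized by a small
+# sketch.  The helper is illustrative only (not a library API) and encodes
+# an assumed rule inferred from the assertions below: "C" and "F" force the
+# corresponding layout, while "A" and "K" follow the inputs' layout when
+# both inputs are F-contiguous and otherwise default to C.
+def _expected_matmul_order_sketch(x1, x2, order):
+    if order == "F" or (
+        order in "AK" and x1.flags.f_contiguous and x2.flags.f_contiguous
+    ):
+        return "F"
+    return "C"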
+
+
+def test_matmul_order():
+    get_queue_or_skip()
+
+    sh = (
+        10,
+        10,
+    )
+    sh2 = tuple(2 * dim for dim in sh)
+    n = sh[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.ones(sh, dtype=dt1, order="C")
+        ar2 = dpt.ones(sh, dtype=dt2, order="C")
+        r1 = dpt.matmul(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.matmul(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.matmul(ar1, ar2, order="A")
+        assert r3.flags.c_contiguous
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.flags.c_contiguous
+
+        ar1 = dpt.ones(sh, dtype=dt1, order="F")
+        ar2 = dpt.ones(sh, dtype=dt2, order="F")
+        r1 = dpt.matmul(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.matmul(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.matmul(ar1, ar2, order="A")
+        assert r3.flags.f_contiguous
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.flags.f_contiguous
+
+        ar1 = dpt.ones(sh2, dtype=dt1, order="C")[:10, ::-2]
+        ar2 = dpt.ones(sh2, dtype=dt2, order="C")[:10, ::-2]
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.strides == (n, -1)
+        r5 = dpt.matmul(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+        ar1 = dpt.ones(sh2, dtype=dt1, order="C")[:10, ::-2].mT
+        ar2 = dpt.ones(sh2, dtype=dt2, order="C")[:10, ::-2].mT
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.strides == (-1, n)
+        r5 = dpt.matmul(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+
+def test_matmul_invalid_order():
+    get_queue_or_skip()
+
+    sh = (
+        10,
+        10,
+    )
+    dt = "i4"
+
+    ar1 = dpt.ones(sh, dtype=dt, order="C")
+    ar2 = dpt.ones(sh, dtype=dt, order="C")
+    r = dpt.matmul(ar1, ar2, order="invalid")
+    assert r.flags.c_contiguous
+
+    ar1 = dpt.ones(sh, dtype=dt, order="F")
+    ar2 = dpt.ones(sh, dtype=dt, order="F")
+    r = dpt.matmul(ar1, ar2, order="invalid")
+    assert r.flags.f_contiguous
+
+
+def test_matmul_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = dpctl.SyclQueue()
+
+    sh = (
+        10,
+        10,
+    )
+    dt = "i4"
+    m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+    m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q2)
+
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.matmul(m1, m2)
+
+
+def test_matmul_inplace_broadcasting():
+    get_queue_or_skip()
+
+    sh = (3, 5, 5)
+    dt = "i4"
+
+    m1 = dpt.ones((3, 5, 5), dtype=dt)
+    m2 = dpt.ones((1, 5, 5), dtype=dt)
+    m1 @= m2
+    assert dpt.all(m1 == dpt.full(sh, 5, dtype=dt))
+
+
+def test_matmul_prepend_dims():
+    get_queue_or_skip()
+
+    n = 5
+    for dt1, dt2 in [
+        (dpt.int32, dpt.int32),
+        (dpt.int32, dpt.int64),
+        (dpt.int64, dpt.int32),
+        (dpt.int32, dpt.uint32),
+    ]:
+        m = dpt.ones((n, 4), dtype=dt1)
+        v = dpt.ones((4,), dtype=dt2)
+        r = dpt.matmul(m, v)
+        assert r.shape == (n,)
+
+        r = dpt.matmul(v, m.mT)
+        assert r.shape == (n,)
+
+
+def test_matmul_inplace_same_tensors():
+    get_queue_or_skip()
+
+    n = 5
+    sh = (
+        n,
+        n,
+    )
+
+    ar1 = dpt.ones(sh, dtype="i4")
+    ar1 @= ar1
+    assert dpt.all(ar1 == dpt.full(sh, n, dtype="i4"))
+
+    ar1 = dpt.ones(sh, dtype="i8")
+    ar2 = dpt.ones(sh, dtype="i4")
+    dpt.matmul(ar1, ar2, out=ar1)
+    assert dpt.all(ar1 == dpt.full(sh, n, dtype=ar1.dtype))
+
+    ar1 = dpt.ones(sh, dtype="i4")
+    ar2 = dpt.ones(sh, dtype="i8")
+    dpt.matmul(ar1, ar2, out=ar2)
+    assert dpt.all(ar2 == dpt.full(sh, n, dtype=ar2.dtype))
+
+
+@pytest.fixture
+def random_matrix():
+    rs = np.random.RandomState(seed=123456)
+    m_np = rs.randint(low=0, high=6, size=(400, 400))
+    return m_np
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_largish_square(dtype, random_matrix):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m_np = random_matrix.astype(dtype)
+    x_np = np.matmul(m_np.T, m_np)
+
+    m = dpt.asarray(m_np)
+    mT = dpt.asarray(m.mT, copy=True, order="C")
+    x1 = dpt.matmul(m.mT, m)
+    x2 = dpt.matmul(mT, m)
+
+    tol = 0
+    if dpt.isdtype(x2.dtype, ("real floating", "complex floating")):
+        tol = 32 * dpt.finfo(x2.dtype).eps
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+    # check strided input
+    m_np = m_np[:-1, :-1]
+    x_np = np.matmul(m_np.T, m_np)
+
+    m = m[:-1, :-1]
+    mT = dpt.asarray(m.mT, copy=True, order="C")
+    x1 = dpt.matmul(m.mT, m)
+    x2 = dpt.matmul(mT, m)
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
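+
+# The tolerance choice above (repeated in the rectangular variant below)
+# follows a simple rule worth spelling out: integer matmul must be
+# bit-exact, while floating-point results are compared up to a small
+# multiple of the machine epsilon.  A hedged sketch of that rule (the
+# helper name is illustrative; the factor of 32 comes from these tests,
+# not from any library API):
+def _comparison_tol_sketch(dt):
+    if dpt.isdtype(dt, ("real floating", "complex floating")):
+        return 32 * dpt.finfo(dt).eps
+    return 0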
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_largish_rect(dtype, random_matrix):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m_np = random_matrix.astype(dtype)[:, :-1]
+    x_np = np.matmul(m_np.T[:-2, :], m_np)
+
+    m = dpt.asarray(m_np)
+    mmT = m.mT[:-2, :]
+    mT = dpt.asarray(mmT, copy=True, order="C")
+    x1 = dpt.matmul(mmT, m)
+    x2 = dpt.matmul(mT, m)
+
+    tol = 0
+    if dpt.isdtype(x2.dtype, ("real floating", "complex floating")):
+        tol = 32 * dpt.finfo(x2.dtype).eps
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+    m_np = m_np[:-1, :-1]
+    x_np = np.matmul(m_np.T[:-2, :], m_np)
+
+    m = m[:-1, :-1]
+    mmT = m.mT[:-2, :]
+    mT = dpt.asarray(mmT, copy=True, order="C")
+    x1 = dpt.matmul(mmT, m)
+    x2 = dpt.matmul(mT, m)
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_outer(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((3, 8), dtype=dtype)
+    t2 = dpt.ones((4, 12), dtype=dtype)
+
+    r = dpt.tensordot(t1, t2, axes=0)
+    assert r.shape == t1.shape + t2.shape
+    assert dpt.allclose(r, dpt.ones_like(r))
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_inner(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((3, 8), dtype=dtype)
+    t2 = dpt.ones((4, 8), dtype=dtype)
+
+    r = dpt.tensordot(t1, t2.mT, axes=1)
+    assert r.shape == t1.shape[:1] + t2.shape[:1]
+    assert dpt.allclose(r, dpt.full_like(r, fill_value=t1.shape[1]))
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_double(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((2, 4, 8), dtype=dtype)
+    t2 = dpt.ones((3, 4, 8), dtype=dtype)
+
+    r = dpt.tensordot(t1, dpt.permute_dims(t2, (1, 2, 0)), axes=2)
+    assert r.shape == t1.shape[:1] + t2.shape[:1]
+    expected = dpt.prod(dpt.asarray(t1.shape[1:]))
+    assert dpt.allclose(r, dpt.full_like(r, fill_value=expected))
+
+
+@pytest.mark.parametrize("dtype", ["i4", "f4"])
+def test_tensordot_axes_sequence(dtype):
+    get_queue_or_skip()
+
+    r = 4
+    t1 = dpt.ones((2, 2, 4, 3), dtype=dtype)
+    t2 = dpt.ones((3, 2, 4, 3), dtype=dtype)
+
+    assert len(t1.shape) == r
+    assert len(t2.shape) == r
+
+    expected = dpt.prod(dpt.asarray(t1.shape[1:]))
+
+    for p1 in itertools.permutations(range(r)):
+        assert len(p1) == r
+        inv_p1 = sorted(range(r), key=p1.__getitem__)
+        u1 = dpt.permute_dims(t1, p1)
+        x1_axes = inv_p1[1:]
+        # a fresh permutations iterator is needed for every p1; a single
+        # shared itertools.permutations iterator would be exhausted after
+        # the first pass through the inner loop
+        for p2 in itertools.permutations(range(r)):
+            inv_p2 = sorted(range(r), key=p2.__getitem__)
+            u2 = dpt.permute_dims(t2, p2)
+            x2_axes = inv_p2[1:]
+
+            tdr = dpt.tensordot(u1, u2, axes=(x1_axes, x2_axes))
+            assert tdr.shape == t1.shape[:1] + t2.shape[:1]
+            assert dpt.allclose(tdr, dpt.full_like(tdr, fill_value=expected))
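+
+# The two accepted forms of the ``axes`` argument exercised above can be
+# summarized by a sketch.  The helper is illustrative only; the assumption
+# (NumPy-compatible semantics) is that an integer N means "contract the
+# last N axes of the first array against the first N axes of the second",
+# while a pair of sequences names the contracted axes explicitly.
+def _normalize_tensordot_axes_sketch(x1_ndim, axes):
+    if isinstance(axes, int):
+        return tuple(range(x1_ndim - axes, x1_ndim)), tuple(range(axes))
+    axes1, axes2 = axes
+    return tuple(axes1), tuple(axes2)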
dpt.tensordot(t1, t2, axes=invalid_axes) + + invalid_axes = ( + (1,), + ( + 0, + 2, + ), + ) + with pytest.raises(ValueError): + dpt.tensordot(t1, t2, axes=invalid_axes) + + with pytest.raises(ValueError): + dpt.tensordot(t1[..., :5], t2) + + +def test_tensordot_promotion(): + get_queue_or_skip() + + t1 = dpt.zeros((10, 10), dtype="i4") + t2 = dpt.zeros((10, 10), dtype="i8") + + r1 = dpt.tensordot(t1, t2) + assert r1.dtype == t2.dtype + + r2 = dpt.tensordot(t2, t1) + assert r2.dtype == t2.dtype + + t3 = dpt.zeros((10, 10), dtype="u4") + r3 = dpt.tensordot(t1, t3) + assert r3.dtype == dpt.result_type(t1, t3) + + +def test_tensordot_axes_errors(): + get_queue_or_skip() + + m1 = dpt.zeros((10, 10), dtype="i4") + m2 = dpt.zeros((10, 10), dtype="i4") + + with pytest.raises(ValueError): + dpt.tensordot(m1, m2, axes=-1) + + +# tests for gh-1570 +def test_tensordot_gemm_small_k_m(): + get_queue_or_skip() + + x1 = dpt.asarray(1, dtype="i2") + x2 = dpt.asarray([0, 1, 0, 0], dtype="i2") + + res = dpt.tensordot(x1, x2, axes=0) + assert dpt.all(x2 == res) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_vecdot_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 511 + v1 = dpt.ones(n, dtype=dtype) + + v2 = dpt.ones(n, dtype=dtype) + + r = dpt.vecdot(v1, v2) + expected_value = _map_int_to_type(n, r.dtype) + assert r == expected_value + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_vecdot_3d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m1, m2, n = 7, 3, 511 + v1 = dpt.ones((m1, m2, n), dtype=dtype) + + v2 = dpt.ones((m1, m2, n), dtype=dtype) + + r = dpt.vecdot(v1, v2) + + assert r.shape == ( + m1, + m2, + ) + expected_value = _map_int_to_type(n, r.dtype) + assert dpt.all(r == expected_value) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_vecdot_axis(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m1, m2, n = 7, 3, 511 + v1 = dpt.ones((m1, n, m2), dtype=dtype) + + v2 = dpt.ones((m1, n, m2), dtype=dtype) + + r = dpt.vecdot(v1, v2, axis=-2) + + assert r.shape == ( + m1, + m2, + ) + expected_value = _map_int_to_type(n, r.dtype) + assert dpt.all(r == expected_value) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_vecdot_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m1, m2, n = 7, 3, 511 + list1 = [1, 0, 2, 0] + pattern1 = dpt.asarray(list1, dtype=dtype) + n_padded1 = pattern1.size * (1 + ((n - 1) // pattern1.size)) + v1 = dpt.tile(dpt.reshape(pattern1, (1, -1, 1)), (m1, n_padded1, m2))[ + ::-1, :n, : + ] + + list2 = [1, 2, 1, 2] + pattern2 = dpt.asarray(list2, dtype=dtype) + n_padded2 = pattern2.size * (1 + ((n - 1) // pattern2.size)) + v2 = dpt.tile(dpt.reshape(pattern2, (1, -1, 1)), (m1, n_padded2, m2))[ + :, :n, ::-1 + ] + + r = dpt.vecdot(v1, v2, axis=-2) + + ref = sum( + el1 * el2 + for el1, el2 in zip((list1 * n_padded1)[:n], (list2 * n_padded2)[:n]) + ) + + assert r.shape == ( + m1, + m2, + ) + ref = _map_int_to_type(ref, r.dtype) + assert dpt.all(r == ref) + + +def test_vector_arg_validation(): + get_queue_or_skip() + + s1, s2 = dpt.ones(tuple()), dpt.zeros(tuple()) + v1, v2 = dpt.ones(16), dpt.zeros(16) + + with pytest.raises(ValueError): + dpt.vecdot(s1, v2) + + with pytest.raises(ValueError): + dpt.vecdot(v1, s2) + + with pytest.raises(TypeError): + dpt.vecdot(dict(), v2) + + with pytest.raises(TypeError): + dpt.vecdot(v2, None) + + with pytest.raises(ValueError): + dpt.vecdot(v1[:5], v2[:4]) + + 
with pytest.raises(ValueError): + dpt.vecdot(v1, v2, axis=2) + + with pytest.raises(ValueError): + dpt.vecdot(v1, v2, axis=-2) + + q = dpctl.SyclQueue( + v2.sycl_context, v2.sycl_device, property="enable_profiling" + ) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.vecdot(v1, v2.to_device(q)) + + m1 = dpt.empty((10, 5)) + m2 = dpt.empty((5, 5)) + with pytest.raises(ValueError): + dpt.vecdot(m1, m2, axis=-1) + + +def test_vecdot_broadcast(): + get_queue_or_skip() + + for dt1, dt2 in [ + (dpt.int32, dpt.int32), + (dpt.int32, dpt.int64), + (dpt.int64, dpt.int32), + (dpt.int32, dpt.uint32), + ]: + m1 = dpt.zeros((1, 5), dtype=dt1) + m2 = dpt.zeros((5, 5), dtype=dt2) + r1 = dpt.vecdot(m1, m2, axis=-1) + r2 = dpt.vecdot(m2, m1, axis=-1) + assert r1.shape == r2.shape + + +@pytest.mark.parametrize("dt1", _numeric_types) +@pytest.mark.parametrize("dt2", _numeric_types) +def test_vecdot_type_promotion(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + v1 = dpt.ones(128, dtype=dt1) + v2 = dpt.ones(128, dtype=dt2) + + r = dpt.vecdot(v1, v2) + mul = v1 * v2 + assert r.shape == () + assert r.dtype == mul.dtype + assert dpt.allclose(r, dpt.sum(mul, dtype=mul.dtype)) + + +def test_vecdot_broadcast_o1_buffer(): + get_queue_or_skip() + + v1 = dpt.arange(10, dtype="i2") + v2 = dpt.ones((5, 10), dtype="i4") + + res1 = dpt.vecdot(v1, v2) + assert res1.shape == (5,) + + res2 = dpt.vecdot(v2, v1) + assert res2.shape == (5,) + + +def test_vecdot_contig_small(): + get_queue_or_skip() + + n = 1 + for dt in [dpt.int16, dpt.int32, dpt.complex64]: + v1 = dpt.zeros((10, n), dtype=dt) + v2 = dpt.ones_like(v1, dtype=dt) + v1[-1] = 1 + res = dpt.vecdot(v1, v2) + assert dpt.all(res[:-1] == 0) + assert res[-1] == n + + +def test_matmul_out_appended_axes(): + get_queue_or_skip() + + n0, n1, n2 = 4, 10, 5 + # vm + x1 = dpt.ones(n1, dtype="i4") + x2 = dpt.ones((n0, n1, n2), dtype="i4") + out = dpt.empty((n0, n2), dtype="i4") + + dpt.matmul(x1, x2, out=out) + assert dpt.all(out == n1) + + # mv + x2 = x2.mT + x1, x2 = x2, x1 + dpt.matmul(x1, x2, out=out) + assert dpt.all(out == n1) + + # vv + x1 = dpt.ones(n1, dtype="i4") + out = dpt.empty((), dtype="i4") + dpt.matmul(x1, x2, out=out) + assert out == n1 diff --git a/dpnp/tests/tensor/test_usm_ndarray_manipulation.py b/dpnp/tests/tensor/test_usm_ndarray_manipulation.py new file mode 100644 index 000000000000..0375bb446370 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_manipulation.py @@ -0,0 +1,1608 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_, assert_array_equal, assert_raises_regex + +import dpnp.tensor as dpt +from dpnp.tensor._numpy_helper import AxisError + +from .helper import get_queue_or_skip + + +def test_permute_dims_incorrect_type(): + X_list = list([[1, 2, 3], [4, 5, 6]]) + X_tuple = tuple(X_list) + Xnp = np.array(X_list) + + pytest.raises(TypeError, dpt.permute_dims, X_list, (1, 0)) + pytest.raises(TypeError, dpt.permute_dims, X_tuple, (1, 0)) + pytest.raises(TypeError, dpt.permute_dims, Xnp, (1, 0)) + + +def test_permute_dims_empty_array(): + q = get_queue_or_skip() + + Xnp = np.empty((10, 0)) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.permute_dims(X, (1, 0)) + Ynp = np.transpose(Xnp, (1, 0)) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_permute_dims_0d_1d(): + q = get_queue_or_skip() + + Xnp_0d = np.array(1, dtype="int64") + X_0d = dpt.asarray(Xnp_0d, sycl_queue=q) + Y_0d = dpt.permute_dims(X_0d, ()) + assert_array_equal(dpt.asnumpy(Y_0d), dpt.asnumpy(X_0d)) + + Xnp_1d = np.random.randint(0, 2, size=6, dtype="int64") + X_1d = dpt.asarray(Xnp_1d, sycl_queue=q) + Y_1d = dpt.permute_dims(X_1d, (0)) + assert_array_equal(dpt.asnumpy(Y_1d), dpt.asnumpy(X_1d)) + + pytest.raises(ValueError, dpt.permute_dims, X_1d, ()) + pytest.raises(AxisError, dpt.permute_dims, X_1d, (1)) + pytest.raises(ValueError, dpt.permute_dims, X_1d, (1, 0)) + pytest.raises( + ValueError, dpt.permute_dims, dpt.reshape(X_1d, (2, 3)), (1, 1) + ) + + +@pytest.mark.parametrize("shapes", [(2, 2), (1, 4), (3, 3, 3), (4, 1, 3)]) +def test_permute_dims_2d_3d(shapes): + q = get_queue_or_skip() + + Xnp_size = np.prod(shapes) + + Xnp = np.random.randint(0, 2, size=Xnp_size, dtype="int64").reshape(shapes) + X = dpt.asarray(Xnp, sycl_queue=q) + X_ndim = X.ndim + if X_ndim == 2: + Y = dpt.permute_dims(X, (1, 0)) + Ynp = np.transpose(Xnp, (1, 0)) + elif X_ndim == 3: + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.permute_dims(X, (2, 0, 1)) + Ynp = np.transpose(Xnp, (2, 0, 1)) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_expand_dims_incorrect_type(): + X_list = [1, 2, 3, 4, 5] + with pytest.raises(TypeError): + dpt.expand_dims(X_list, axis=1) + + +def test_expand_dims_0d(): + q = get_queue_or_skip() + + Xnp = np.array(1, dtype="int64") + X = dpt.asarray(Xnp, sycl_queue=q) + + Y = dpt.expand_dims(X, axis=0) + Ynp = np.expand_dims(Xnp, axis=0) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.expand_dims(X, axis=-1) + Ynp = np.expand_dims(Xnp, axis=-1) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + pytest.raises(AxisError, dpt.expand_dims, X, axis=1) + pytest.raises(AxisError, 
dpt.expand_dims, X, axis=-2) + + +@pytest.mark.parametrize("shapes", [(3,), (3, 3), (3, 3, 3)]) +def test_expand_dims_1d_3d(shapes): + q = get_queue_or_skip() + + Xnp_size = np.prod(shapes) + + Xnp = np.random.randint(0, 2, size=Xnp_size, dtype="int64").reshape(shapes) + X = dpt.asarray(Xnp, sycl_queue=q) + shape_len = len(shapes) + for axis in range(-shape_len - 1, shape_len): + Y = dpt.expand_dims(X, axis=axis) + Ynp = np.expand_dims(Xnp, axis=axis) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + pytest.raises(AxisError, dpt.expand_dims, X, axis=shape_len + 1) + pytest.raises(AxisError, dpt.expand_dims, X, axis=-shape_len - 2) + + +@pytest.mark.parametrize( + "axes", [(0, 1, 2), (0, -1, -2), (0, 3, 5), (0, -3, -5)] +) +def test_expand_dims_tuple(axes): + q = get_queue_or_skip() + + Xnp = np.empty((3, 3, 3), dtype="u1") + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.expand_dims(X, axis=axes) + Ynp = np.expand_dims(Xnp, axis=axes) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_expand_dims_incorrect_tuple(): + try: + X = dpt.empty((3, 3, 3), dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(AxisError): + dpt.expand_dims(X, axis=(0, -6)) + with pytest.raises(AxisError): + dpt.expand_dims(X, axis=(0, 5)) + + with pytest.raises(ValueError): + dpt.expand_dims(X, axis=(1, 1)) + + +def test_squeeze_incorrect_type(): + X_list = [1, 2, 3, 4, 5] + with pytest.raises(TypeError): + dpt.squeeze(X_list, 1) + + +def test_squeeze_0d(): + q = get_queue_or_skip() + + Xnp = np.array(1) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.squeeze(X) + Ynp = Xnp.squeeze() + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.squeeze(X, 0) + Ynp = Xnp.squeeze(0) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.squeeze(X, (0)) + Ynp = Xnp.squeeze(0) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.squeeze(X, -1) + Ynp = Xnp.squeeze(-1) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + pytest.raises(AxisError, dpt.squeeze, X, 1) + pytest.raises(AxisError, dpt.squeeze, X, -2) + pytest.raises(AxisError, dpt.squeeze, X, (1)) + pytest.raises(AxisError, dpt.squeeze, X, (-2)) + pytest.raises(ValueError, dpt.squeeze, X, (0, 0)) + + +@pytest.mark.parametrize( + "shapes", + [ + (0), + (1), + (1, 2), + (2, 1), + (1, 1), + (2, 2), + (1, 0), + (0, 1), + (1, 2, 1), + (2, 1, 2), + (2, 2, 2), + (1, 1, 1), + (1, 0, 1), + (0, 1, 0), + ], +) +def test_squeeze_without_axes(shapes): + q = get_queue_or_skip() + + Xnp = np.empty(shapes, dtype="u1") + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.squeeze(X) + Ynp = Xnp.squeeze() + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("axes", [0, 2, (0), (2), (0, 2)]) +def test_squeeze_axes_arg(axes): + q = get_queue_or_skip() + + Xnp = np.array([[[1], [2], [3]]], dtype="u1") + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.squeeze(X, axes) + Ynp = Xnp.squeeze(axes) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("axes", [1, -2, (1), (-2), (0, 0), (1, 1)]) +def test_squeeze_axes_arg_error(axes): + q = get_queue_or_skip() + + Xnp = np.array([[[1], [2], [3]]], dtype="u1") + X = dpt.asarray(Xnp, sycl_queue=q) + pytest.raises(ValueError, dpt.squeeze, X, axes) + + +@pytest.mark.parametrize( + "data", + [ + [np.array(0, dtype="u1"), (0,)], + [np.array(0, dtype="u1"), (1,)], + [np.array(0, dtype="u1"), (3,)], + [np.ones(1, dtype="u1"), (1,)], + [np.ones(1, dtype="u1"), (2,)], + [np.ones(1, dtype="u1"), (1, 2, 3)], + [np.arange(3, dtype="u1"), (3,)], + [np.arange(3, dtype="u1"), 
(1, 3)], + [np.arange(3, dtype="u1"), (2, 3)], + [np.ones(0, dtype="u1"), 0], + [np.ones(1, dtype="u1"), 1], + [np.ones(1, dtype="u1"), 2], + [np.ones(1, dtype="u1"), (0,)], + [np.ones((1, 2), dtype="u1"), (0, 2)], + [np.ones((2, 1), dtype="u1"), (2, 0)], + ], +) +def test_broadcast_to_succeeds(data): + q = get_queue_or_skip() + + Xnp, target_shape = data + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.broadcast_to(X, target_shape) + Ynp = np.broadcast_to(Xnp, target_shape) + assert_array_equal(dpt.asnumpy(Y), Ynp) + + +@pytest.mark.parametrize( + "data", + [ + [(0,), ()], + [(1,), ()], + [(3,), ()], + [(3,), (1,)], + [(3,), (2,)], + [(3,), (4,)], + [(1, 2), (2, 1)], + [(1, 1), (1,)], + [(1,), -1], + [(1,), (-1,)], + [(1, 2), (-1, 2)], + ], +) +def test_broadcast_to_raises(data): + q = get_queue_or_skip() + + orig_shape, target_shape = data + Xnp = np.zeros(orig_shape, dtype="i1") + X = dpt.asarray(Xnp, sycl_queue=q) + pytest.raises(ValueError, dpt.broadcast_to, X, target_shape) + + +def assert_broadcast_correct(input_shapes): + q = get_queue_or_skip() + np_arrays = [np.zeros(s, dtype="i1") for s in input_shapes] + out_np_arrays = np.broadcast_arrays(*np_arrays) + usm_arrays = [dpt.asarray(Xnp, sycl_queue=q) for Xnp in np_arrays] + out_usm_arrays = dpt.broadcast_arrays(*usm_arrays) + for Xnp, X in zip(out_np_arrays, out_usm_arrays): + assert_array_equal( + Xnp, dpt.asnumpy(X), err_msg=f"Failed for {input_shapes})" + ) + + +def assert_broadcast_arrays_raise(input_shapes): + q = get_queue_or_skip() + usm_arrays = [dpt.asarray(np.zeros(s), sycl_queue=q) for s in input_shapes] + pytest.raises(ValueError, dpt.broadcast_arrays, *usm_arrays) + + +def test_broadcast_arrays_same(): + q = get_queue_or_skip() + Xnp = np.arange(10) + Ynp = np.arange(10) + res_Xnp, res_Ynp = np.broadcast_arrays(Xnp, Ynp) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.asarray(Ynp, sycl_queue=q) + res_X, res_Y = dpt.broadcast_arrays(X, Y) + assert_array_equal(res_Xnp, dpt.asnumpy(res_X)) + assert_array_equal(res_Ynp, dpt.asnumpy(res_Y)) + + +def test_broadcast_arrays_one_off(): + q = get_queue_or_skip() + Xnp = np.array([[1, 2, 3]]) + Ynp = np.array([[1], [2], [3]]) + res_Xnp, res_Ynp = np.broadcast_arrays(Xnp, Ynp) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.asarray(Ynp, sycl_queue=q) + res_X, res_Y = dpt.broadcast_arrays(X, Y) + assert_array_equal(res_Xnp, dpt.asnumpy(res_X)) + assert_array_equal(res_Ynp, dpt.asnumpy(res_Y)) + + +@pytest.mark.parametrize( + "shapes", + [ + (), + (1,), + (3,), + (0, 1), + (0, 3), + (1, 0), + (3, 0), + (1, 3), + (3, 1), + (3, 3), + ], +) +def test_broadcast_arrays_same_shapes(shapes): + for shape in shapes: + single_input_shapes = [shape] + assert_broadcast_correct(single_input_shapes) + double_input_shapes = [shape, shape] + assert_broadcast_correct(double_input_shapes) + triple_input_shapes = [shape, shape, shape] + assert_broadcast_correct(triple_input_shapes) + + +@pytest.mark.parametrize( + "shapes", + [ + [[(1,), (3,)]], + [[(1, 3), (3, 3)]], + [[(3, 1), (3, 3)]], + [[(1, 3), (3, 1)]], + [[(1, 1), (3, 3)]], + [[(1, 1), (1, 3)]], + [[(1, 1), (3, 1)]], + [[(1, 0), (0, 0)]], + [[(0, 1), (0, 0)]], + [[(1, 0), (0, 1)]], + [[(1, 1), (0, 0)]], + [[(1, 1), (1, 0)]], + [[(1, 1), (0, 1)]], + ], +) +def test_broadcast_arrays_same_len_shapes(shapes): + # Check that two different input shapes of the same length, but some have + # ones, broadcast to the correct shape. 
+ + for input_shapes in shapes: + assert_broadcast_correct(input_shapes) + assert_broadcast_correct(input_shapes[::-1]) + + +@pytest.mark.parametrize( + "shapes", + [ + [[(), (3,)]], + [[(3,), (3, 3)]], + [[(3,), (3, 1)]], + [[(1,), (3, 3)]], + [[(), (3, 3)]], + [[(1, 1), (3,)]], + [[(1,), (3, 1)]], + [[(1,), (1, 3)]], + [[(), (1, 3)]], + [[(), (3, 1)]], + [[(), (0,)]], + [[(0,), (0, 0)]], + [[(0,), (0, 1)]], + [[(1,), (0, 0)]], + [[(), (0, 0)]], + [[(1, 1), (0,)]], + [[(1,), (0, 1)]], + [[(1,), (1, 0)]], + [[(), (1, 0)]], + [[(), (0, 1)]], + ], +) +def test_broadcast_arrays_different_len_shapes(shapes): + # Check that two different input shapes (of different lengths) broadcast + # to the correct shape. + + for input_shapes in shapes: + assert_broadcast_correct(input_shapes) + assert_broadcast_correct(input_shapes[::-1]) + + +@pytest.mark.parametrize( + "shapes", + [ + [[(3,), (4,)]], + [[(2, 3), (2,)]], + [[(3,), (3,), (4,)]], + [[(1, 3, 4), (2, 3, 3)]], + ], +) +def test_incompatible_shapes_raise_valueerror(shapes): + for input_shapes in shapes: + assert_broadcast_arrays_raise(input_shapes) + assert_broadcast_arrays_raise(input_shapes[::-1]) + + +def test_broadcast_arrays_no_args(): + with pytest.raises(ValueError): + dpt.broadcast_arrays() + + +def test_flip_axis_incorrect(): + q = get_queue_or_skip() + + X_np = np.ones((4, 4)) + X = dpt.asarray(X_np, sycl_queue=q) + + pytest.raises(AxisError, dpt.flip, dpt.asarray(np.ones(4)), axis=1) + pytest.raises(AxisError, dpt.flip, X, axis=2) + pytest.raises(AxisError, dpt.flip, X, axis=-3) + pytest.raises(AxisError, dpt.flip, X, axis=(0, 3)) + + +def test_flip_0d(): + q = get_queue_or_skip() + + Xnp = np.array(1, dtype="int64") + X = dpt.asarray(Xnp, sycl_queue=q) + Ynp = np.flip(Xnp) + Y = dpt.flip(X) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + pytest.raises(AxisError, dpt.flip, X, axis=0) + pytest.raises(AxisError, dpt.flip, X, axis=1) + pytest.raises(AxisError, dpt.flip, X, axis=-1) + + +def test_flip_1d(): + q = get_queue_or_skip() + + Xnp = np.arange(6) + X = dpt.asarray(Xnp, sycl_queue=q) + + for ax in range(-X.ndim, X.ndim): + Ynp = np.flip(Xnp, axis=ax) + Y = dpt.flip(X, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Ynp = np.flip(Xnp, axis=0) + Y = dpt.flip(X, axis=0) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "shapes", + [ + (3, 2), + (2, 3), + (2, 2), + (3, 3), + (3, 2, 3), + (2, 3, 2), + (2, 2, 2), + (3, 3, 3), + ], +) +def test_flip_2d_3d(shapes): + q = get_queue_or_skip() + + Xnp_size = np.prod(shapes) + Xnp = np.arange(Xnp_size).reshape(shapes) + X = dpt.asarray(Xnp, sycl_queue=q) + for ax in range(-X.ndim, X.ndim): + Y = dpt.flip(X, axis=ax) + Ynp = np.flip(Xnp, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "shapes", + [ + (1,), + (3,), + (2, 3), + (3, 2), + (2, 2), + (1, 2, 3), + (2, 1, 3), + (2, 3, 1), + (3, 2, 1), + (3, 3, 3), + ], +) +def test_flip_default_axes(shapes): + q = get_queue_or_skip() + + Xnp_size = np.prod(shapes) + Xnp = np.arange(Xnp_size).reshape(shapes) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.flip(X) + Ynp = np.flip(Xnp) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "shapes", + [ + (0), + (1), + (1, 1), + (1, 0), + (0, 1), + (1, 1, 1), + (1, 0, 1), + (0, 1, 0), + ], +) +def test_flip_empty_0_size_dim(shapes): + q = get_queue_or_skip() + + X = dpt.empty(shapes, sycl_queue=q) + dpt.flip(X) + + +@pytest.mark.parametrize( + "data", + [ + [(2, 3), (0, 1)], + [(2, 3), (1, 0)], + [(2, 3), ()], + [(2, 1, 
3), (0, 2)], + [(3, 1, 2), (2, 0)], + [(3, 3, 3), (2,)], + [(1, 2, 3), [0, -2]], + [(3, 1, 2), [-1, 0]], + [(3, 3, 3), [-2, -1]], + ], +) +def test_flip_multiple_axes(data): + q = get_queue_or_skip() + + shape, axes = data + Xnp_size = np.prod(shape) + Xnp = np.arange(Xnp_size).reshape(shape) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.flip(X, axis=axes) + Ynp = np.flip(Xnp, axis=axes) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_roll_scalar(): + q = get_queue_or_skip() + + Xnp = np.ones([], dtype="f4") + X = dpt.asarray(Xnp, sycl_queue=q) + + Y = dpt.roll(X, 1) + Ynp = np.roll(Xnp, 1) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + with pytest.raises(AxisError): + dpt.roll(X, 1, axis=0) + with pytest.raises(AxisError): + dpt.roll(X, 1, axis=1) + + +@pytest.mark.parametrize( + "data", + [ + [2, None], + [-2, None], + [2, 0], + [-2, 0], + [2, ()], + [11, 0], + ], +) +def test_roll_1d(data): + q = get_queue_or_skip() + + Xnp = np.arange(10) + X = dpt.asarray(Xnp, sycl_queue=q) + sh, ax = data + + Y = dpt.roll(X, sh, axis=ax) + Ynp = np.roll(Xnp, sh, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.roll(X, sh, axis=ax) + Ynp = np.roll(Xnp, sh, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "data", + [ + [1, None], + [1, 0], + [1, 1], + [1, ()], + # Roll multiple axes at once + [1, (0, 1)], + [(1, 0), (0, 1)], + [(-1, 0), (1, 0)], + [(0, 1), (0, 1)], + [(0, -1), (0, 1)], + [(1, 1), (0, 1)], + [(-1, -1), (0, 1)], + # Roll the same axis multiple times. + [1, (0, 0)], + [1, (1, 1)], + # Roll more than one turn in either direction. + [6, 1], + [-4, 1], + ], +) +def test_roll_2d(data): + q = get_queue_or_skip() + + Xnp = np.arange(10).reshape(2, 5) + X = dpt.asarray(Xnp, sycl_queue=q) + sh, ax = data + + Y = dpt.roll(X, sh, axis=ax) + Ynp = np.roll(Xnp, sh, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_roll_out_bounds_shifts(): + "See gh-1857" + get_queue_or_skip() + + x = dpt.arange(4) + y = dpt.roll(x, np.uint64(2**63 + 2)) + expected = dpt.roll(x, 2) + assert dpt.all(y == expected) + + x_empty = x[1:1] + y = dpt.roll(x_empty, 11) + assert y.size == 0 + + x_2d = dpt.reshape(x, (2, 2)) + y = dpt.roll(x_2d, np.uint64(2**63 + 1), axis=1) + expected = dpt.roll(x_2d, 1, axis=1) + assert dpt.all(y == expected) + + x_2d_empty = x_2d[:, 1:1] + y = dpt.roll(x_2d_empty, 3, axis=1) + expected = dpt.empty_like(x_2d_empty) + assert dpt.all(y == expected) + + +def test_roll_validation(): + get_queue_or_skip() + + X = {} + with pytest.raises(TypeError): + dpt.roll(X) + + X = dpt.empty((1, 2, 3)) + shift = ((2, 3, 1), (1, 2, 3)) + with pytest.raises(ValueError): + dpt.roll(X, shift=shift, axis=(0, 1, 2)) + + +def test_concat_incorrect_type(): + Xnp = np.ones((2, 2)) + with pytest.raises(TypeError): + dpt.concat() + with pytest.raises(TypeError): + dpt.concat([]) + with pytest.raises(TypeError): + dpt.concat(Xnp) + with pytest.raises(TypeError): + dpt.concat([Xnp, Xnp]) + + +def test_concat_incorrect_queue(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + X = dpt.ones((2, 2), sycl_queue=q1) + Y = dpt.ones((2, 2), sycl_queue=q2) + + pytest.raises(ValueError, dpt.concat, [X, Y]) + + +def test_concat_different_dtype(): + q = get_queue_or_skip() + + X = dpt.ones((2, 2), dtype=np.int64, sycl_queue=q) + Y = dpt.ones((3, 2), dtype=np.uint32, sycl_queue=q) + + XY = dpt.concat([X, Y]) + + assert XY.dtype is X.dtype + assert XY.shape == (5, 2) + assert XY.sycl_queue == q + + X1 = dpt.arange(10, dtype="i2", sycl_queue=q) + Y1 = 
dpt.arange(5, dtype="i4", sycl_queue=q) + + XY1 = dpt.concat([X1[::2], Y1[::-1]], axis=None) + assert XY1.shape == (10,) + assert XY1.sycl_queue == q + assert XY1.dtype == Y1.dtype + + +def test_concat_incorrect_ndim(): + q = get_queue_or_skip() + + X = dpt.ones((2, 2), sycl_queue=q) + Y = dpt.ones((2, 2, 2), sycl_queue=q) + + pytest.raises(ValueError, dpt.concat, [X, Y]) + + +@pytest.mark.parametrize( + "data", + [ + [(2, 2), (3, 3), 0], + [(2, 2), (3, 3), 1], + [(3, 2), (3, 3), 0], + [(2, 3), (3, 3), 1], + ], +) +def test_concat_incorrect_shape(data): + q = get_queue_or_skip() + + Xshape, Yshape, axis = data + + X = dpt.ones(Xshape, sycl_queue=q) + Y = dpt.ones(Yshape, sycl_queue=q) + + pytest.raises(ValueError, dpt.concat, [X, Y], axis=axis) + + +@pytest.mark.parametrize( + "data", + [ + [(6,), 0], + [(2, 3), 1], + [(3, 2), -1], + [(1, 6), 0], + [(2, 1, 3), 2], + ], +) +def test_concat_1array(data): + q = get_queue_or_skip() + + Xshape, axis = data + + Xnp = np.arange(6).reshape(Xshape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.concatenate([Xnp], axis=axis) + Y = dpt.concat([X], axis=axis) + + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Ynp = np.concatenate((Xnp,), axis=axis) + Y = dpt.concat((X,), axis=axis) + + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "data", + [ + [(1,), (1,), 0], + [(0, 2), (0, 2), 1], + [(0, 2), (2, 2), 0], + [(2, 1), (2, 2), -1], + [(2, 2, 2), (2, 1, 2), 1], + [(3, 3, 3), (2, 2), None], + ], +) +def test_concat_2arrays(data): + q = get_queue_or_skip() + + Xshape, Yshape, axis = data + + Xnp = np.ones(Xshape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.zeros(Yshape) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.concatenate([Xnp, Ynp], axis=axis) + Z = dpt.concat([X, Y], axis=axis) + + assert_array_equal(Znp, dpt.asnumpy(Z)) + + +@pytest.mark.parametrize( + "data", + [ + [(1,), (1,), (1,), 0], + [(0, 2), (2, 2), (1, 2), 0], + [(2, 1, 2), (2, 2, 2), (2, 4, 2), 1], + ], +) +def test_concat_3arrays(data): + q = get_queue_or_skip() + + Xshape, Yshape, Zshape, axis = data + + Xnp = np.ones(Xshape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.zeros(Yshape) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.full(Zshape, 2.0) + Z = dpt.asarray(Znp, sycl_queue=q) + + Rnp = np.concatenate([Xnp, Ynp, Znp], axis=axis) + R = dpt.concat([X, Y, Z], axis=axis) + + assert_array_equal(Rnp, dpt.asnumpy(R)) + + +def test_concat_axis_none_strides(): + q = get_queue_or_skip() + Xnp = np.arange(0, 18).reshape((6, 3)) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.arange(20, 36).reshape((4, 2, 2)) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.concatenate([Xnp[::2], Ynp[::2]], axis=None) + Z = dpt.concat([X[::2], Y[::2]], axis=None) + + assert_array_equal(Znp, dpt.asnumpy(Z)) + + +def test_stack_incorrect_shape(): + q = get_queue_or_skip() + + X = dpt.ones((1,), sycl_queue=q) + Y = dpt.ones((2,), sycl_queue=q) + + with pytest.raises(ValueError): + dpt.stack([X, Y], axis=0) + + +@pytest.mark.parametrize( + "data", + [ + [(6,), 0], + [(2, 3), 1], + [(3, 2), -1], + [(1, 6), 2], + [(2, 1, 3), 2], + ], +) +def test_stack_1array(data): + q = get_queue_or_skip() + + shape, axis = data + + Xnp = np.arange(6).reshape(shape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.stack([Xnp], axis=axis) + Y = dpt.stack([X], axis=axis) + + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Ynp = np.stack((Xnp,), axis=axis) + Y = dpt.stack((X,), axis=axis) + + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "data", + [ + [(1,), 
0], + [(0, 2), 0], + [(2, 0), 0], + [(2, 3), 0], + [(2, 3), 1], + [(2, 3), 2], + [(2, 3), -1], + [(2, 3), -2], + [(2, 2, 2), 1], + ], +) +def test_stack_2arrays(data): + q = get_queue_or_skip() + + shape, axis = data + + Xnp = np.ones(shape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.zeros(shape) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.stack([Xnp, Ynp], axis=axis) + Z = dpt.stack([X, Y], axis=axis) + + assert_array_equal(Znp, dpt.asnumpy(Z)) + + +@pytest.mark.parametrize( + "data", + [ + [(1,), 0], + [(0, 2), 0], + [(2, 1, 2), 1], + ], +) +def test_stack_3arrays(data): + q = get_queue_or_skip() + + shape, axis = data + + Xnp = np.ones(shape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.zeros(shape) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.full(shape, 2.0) + Z = dpt.asarray(Znp, sycl_queue=q) + + Rnp = np.stack([Xnp, Ynp, Znp], axis=axis) + R = dpt.stack([X, Y, Z], axis=axis) + + assert_array_equal(Rnp, dpt.asnumpy(R)) + + +def test_can_cast(): + q = get_queue_or_skip() + + # incorrect input + X = dpt.ones((2, 2), dtype=dpt.int16, sycl_queue=q) + pytest.raises(TypeError, dpt.can_cast, X, 1) + pytest.raises(TypeError, dpt.can_cast, X, X) + X_np = np.ones((2, 2), dtype=np.int16) + + assert dpt.can_cast(X, "float32") == np.can_cast(X_np, "float32") + assert dpt.can_cast(X, dpt.int32) == np.can_cast(X_np, np.int32) + assert dpt.can_cast(X, dpt.int64) == np.can_cast(X_np, np.int64) + + +def test_result_type(): + q = get_queue_or_skip() + + usm_ar = dpt.ones((2), dtype=dpt.int16, sycl_queue=q) + np_ar = dpt.asnumpy(usm_ar) + + X = [usm_ar, dpt.int32, "int64", usm_ar] + X_np = [np_ar, np.int32, "int64", np_ar] + + assert dpt.result_type(*X) == np.result_type(*X_np) + + X = [usm_ar, dpt.int32, "int64", True] + X_np = [np_ar, np.int32, "int64", True] + + assert dpt.result_type(*X) == np.result_type(*X_np) + + X = [usm_ar, dpt.int32, "int64", 2] + X_np = [np_ar, np.int32, "int64", 2] + + assert dpt.result_type(*X) == np.result_type(*X_np) + + X = [dpt.int32, "int64", 2] + X_np = [np.int32, "int64", 2] + + assert dpt.result_type(*X) == np.result_type(*X_np) + + X = [usm_ar, dpt.int32, "int64", 2.0] + X_np = [np_ar, np.int32, "int64", 2.0] + + assert dpt.result_type(*X).kind == np.result_type(*X_np).kind + + X = [usm_ar, dpt.int32, "int64", 2.0 + 1j] + X_np = [np_ar, np.int32, "int64", 2.0 + 1j] + + assert dpt.result_type(*X).kind == np.result_type(*X_np).kind + + +def test_swapaxes_1d(): + get_queue_or_skip() + x = np.array([[1, 2, 3]]) + exp = np.swapaxes(x, 0, 1) + + y = dpt.asarray([[1, 2, 3]]) + res = dpt.swapaxes(y, 0, 1) + + assert_array_equal(exp, dpt.asnumpy(res)) + + +def test_swapaxes_2d(): + get_queue_or_skip() + x = np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) + exp = np.swapaxes(x, 0, 2) + + y = dpt.asarray([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) + res = dpt.swapaxes(y, 0, 2) + + assert_array_equal(exp, dpt.asnumpy(res)) + + +@pytest.mark.parametrize( + "source, expected", + [ + (0, (6, 7, 5)), + (1, (5, 7, 6)), + (2, (5, 6, 7)), + (-1, (5, 6, 7)), + ], +) +def test_moveaxis_move_to_end(source, expected): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(5 * 6 * 7), (5, 6, 7)) + actual = dpt.moveaxis(x, source, -1).shape + assert_(actual == expected) + + +@pytest.mark.parametrize( + "source, destination, expected", + [ + (0, 1, (2, 1, 3, 4)), + (1, 2, (1, 3, 2, 4)), + (1, -1, (1, 3, 4, 2)), + ], +) +def test_moveaxis_new_position(source, destination, expected): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(24), (1, 2, 3, 4)) + actual = dpt.moveaxis(x, 
source, destination).shape + assert_(actual == expected) + + +@pytest.mark.parametrize( + "source, destination", + [ + (0, 0), + (3, -1), + (-1, 3), + ([0, -1], [0, -1]), + ([2, 0], [2, 0]), + ], +) +def test_moveaxis_preserve_order(source, destination): + get_queue_or_skip() + x = dpt.zeros((1, 2, 3, 4)) + actual = dpt.moveaxis(x, source, destination).shape + assert_(actual == (1, 2, 3, 4)) + + +@pytest.mark.parametrize( + "shape, source, destination, expected", + [ + ((0, 1, 2, 3), [0, 1], [2, 3], (2, 3, 0, 1)), + ((0, 1, 2, 3), [2, 3], [0, 1], (2, 3, 0, 1)), + ((0, 1, 2, 3), [0, 1, 2], [2, 3, 0], (2, 3, 0, 1)), + ((0, 1, 2, 3), [3, 0], [1, 0], (0, 3, 1, 2)), + ((0, 1, 2, 3), [0, 3], [0, 1], (0, 3, 1, 2)), + ((1, 2, 3, 4), range(4), range(4), (1, 2, 3, 4)), + ], +) +def test_moveaxis_move_multiples(shape, source, destination, expected): + get_queue_or_skip() + x = dpt.zeros(shape) + y = dpt.moveaxis(x, source, destination) + actual = y.shape + assert_(actual == expected) + assert y._pointer == x._pointer + + +def test_moveaxis_errors(): + try: + x_flat = dpt.arange(6) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + x = dpt.reshape(x_flat, (1, 2, 3)) + assert_raises_regex( + AxisError, "source.*out of bounds", dpt.moveaxis, x, 3, 0 + ) + assert_raises_regex( + AxisError, "source.*out of bounds", dpt.moveaxis, x, -4, 0 + ) + assert_raises_regex( + AxisError, "destination.*out of bounds", dpt.moveaxis, x, 0, 5 + ) + assert_raises_regex( + ValueError, "repeated axis in `source`", dpt.moveaxis, x, [0, 0], [0, 1] + ) + assert_raises_regex( + ValueError, + "repeated axis in `destination`", + dpt.moveaxis, + x, + [0, 1], + [1, 1], + ) + assert_raises_regex( + ValueError, "must have the same number", dpt.moveaxis, x, 0, [0, 1] + ) + assert_raises_regex( + ValueError, "must have the same number", dpt.moveaxis, x, [0, 1], [0] + ) + + +def test_unstack_axis0(): + try: + x_flat = dpt.arange(6) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + y = dpt.reshape(x_flat, (2, 3)) + res = dpt.unstack(y) + + assert_array_equal(dpt.asnumpy(y[0, ...]), dpt.asnumpy(res[0])) + assert_array_equal(dpt.asnumpy(y[1, ...]), dpt.asnumpy(res[1])) + + +def test_unstack_axis1(): + try: + x_flat = dpt.arange(6) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + y = dpt.reshape(x_flat, (2, 3)) + res = dpt.unstack(y, axis=1) + + assert_array_equal(dpt.asnumpy(y[:, 0, ...]), dpt.asnumpy(res[0])) + assert_array_equal(dpt.asnumpy(y[:, 1, ...]), dpt.asnumpy(res[1])) + assert_array_equal(dpt.asnumpy(y[:, 2, ...]), dpt.asnumpy(res[2])) + + +def test_unstack_axis2(): + try: + x_flat = dpt.arange(60) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + y = dpt.reshape(x_flat, (4, 5, 3)) + res = dpt.unstack(y, axis=2) + + assert_array_equal(dpt.asnumpy(y[:, :, 0, ...]), dpt.asnumpy(res[0])) + assert_array_equal(dpt.asnumpy(y[:, :, 1, ...]), dpt.asnumpy(res[1])) + assert_array_equal(dpt.asnumpy(y[:, :, 2, ...]), dpt.asnumpy(res[2])) + + +def test_finfo_object(): + fi = dpt.finfo(dpt.float32) + assert isinstance(fi.bits, int) + assert isinstance(fi.max, float) + assert isinstance(fi.min, float) + assert isinstance(fi.eps, float) + assert isinstance(fi.epsneg, float) + assert isinstance(fi.smallest_normal, float) + assert isinstance(fi.tiny, float) + assert isinstance(fi.precision, float) + assert isinstance(fi.resolution, float) + assert isinstance(fi.dtype, dpt.dtype) + assert isinstance(str(fi), str) + 
assert isinstance(repr(fi), str) + + +def test_repeat_scalar_sequence_agreement(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + expected_res = dpt.empty(10, dtype="i4") + expected_res[1::2], expected_res[::2] = x, x + + # scalar case + reps = 2 + res = dpt.repeat(x, reps) + assert dpt.all(res == expected_res) + + # tuple + reps = (2, 2, 2, 2, 2) + res = dpt.repeat(x, reps) + assert dpt.all(res == expected_res) + + +def test_repeat_as_broadcasting(): + get_queue_or_skip() + + reps = 5 + x = dpt.arange(reps, dtype="i4") + x1 = x[:, dpt.newaxis] + expected_res = dpt.broadcast_to(x1, (reps, reps)) + + res = dpt.repeat(x1, reps, axis=1) + assert dpt.all(res == expected_res) + + x2 = x[dpt.newaxis, :] + expected_res = dpt.broadcast_to(x2, (reps, reps)) + + res = dpt.repeat(x2, reps, axis=0) + assert dpt.all(res == expected_res) + + +def test_repeat_axes(): + get_queue_or_skip() + + reps = 2 + x = dpt.reshape(dpt.arange(5 * 10, dtype="i4"), (5, 10)) + expected_res = dpt.empty((x.shape[0] * 2, x.shape[1]), dtype=x.dtype) + expected_res[::2, :], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + + expected_res = dpt.empty((x.shape[0], x.shape[1] * 2), dtype=x.dtype) + expected_res[:, ::2], expected_res[:, 1::2] = x, x + res = dpt.repeat(x, reps, axis=1) + assert dpt.all(res == expected_res) + + x = dpt.arange(10, dtype="i4") + expected_res = dpt.empty(x.shape[0] * reps, dtype=x.dtype) + expected_res[::2], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + + +def test_repeat_size_0_outputs(): + get_queue_or_skip() + + x = dpt.ones((3, 0, 5), dtype="i4") + reps = 10 + res = dpt.repeat(x, reps, axis=0) + assert res.size == 0 + assert res.shape == (30, 0, 5) + + res = dpt.repeat(x, reps, axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + res = dpt.repeat(x, (2, 2, 2), axis=0) + assert res.size == 0 + assert res.shape == (6, 0, 5) + + x = dpt.ones((3, 2, 5)) + res = dpt.repeat(x, 0, axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + res = dpt.repeat(x, (0, 0), axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + # axis=None cases + res = dpt.repeat(x, 0) + assert res.size == 0 + + res = dpt.repeat(x, (0,) * x.size) + assert res.size == 0 + + +def test_repeat_strides(): + get_queue_or_skip() + + reps = 2 + x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10)) + x1 = x[:, ::-2] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[:, ::2], expected_res[:, 1::2] = x1, x1 + res = dpt.repeat(x1, reps, axis=1) + assert dpt.all(res == expected_res) + res = dpt.repeat(x1, (reps,) * x1.shape[1], axis=1) + assert dpt.all(res == expected_res) + + x1 = x[::-2, :] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[::2, :], expected_res[1::2, :] = x1, x1 + res = dpt.repeat(x1, reps, axis=0) + assert dpt.all(res == expected_res) + res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0) + assert dpt.all(res == expected_res) + + # axis=None + x = dpt.reshape(dpt.arange(10 * 10), (10, 10)) + x1 = dpt.reshape(x[::-2, :], -1) + x2 = x[::-2, :] + expected_res = dpt.empty(10 * 10, dtype="i4") + expected_res[::2], expected_res[1::2] = x1, x1 + res = dpt.repeat(x2, reps) + assert dpt.all(res == expected_res) + res = dpt.repeat(x2, (reps,) * x1.size) + assert dpt.all(res == expected_res) + + +def test_repeat_casting(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + # i4 is cast to i8 + reps = dpt.ones(5, dtype="i4") + res = 
dpt.repeat(x, reps) + assert res.shape == x.shape + assert dpt.all(res == x) + + +def test_repeat_strided_repeats(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + reps = dpt.ones(10, dtype="i8") + reps[::2] = 0 + reps = reps[::-2] + res = dpt.repeat(x, reps) + assert res.shape == x.shape + assert dpt.all(res == x) + + +def test_repeat_size1_repeats(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + expected_res = dpt.repeat(x, 2) + # 0D repeats + reps_0d = dpt.asarray(2, dtype="i8") + res = dpt.repeat(x, reps_0d) + assert dpt.all(res == expected_res) + # 1D repeats + reps_1d = dpt.asarray([2], dtype="i8") + res = dpt.repeat(x, reps_1d) + assert dpt.all(res == expected_res) + + +def test_repeat_arg_validation(): + get_queue_or_skip() + + x = {} + with pytest.raises(TypeError): + dpt.repeat(x, 2) + + # axis must be 0 for scalar + x = dpt.empty(()) + with pytest.raises(ValueError): + dpt.repeat(x, 2, axis=1) + + # repeats must be positive + x = dpt.empty(5) + with pytest.raises(ValueError): + dpt.repeat(x, -2) + + # repeats must be integers + with pytest.raises(TypeError): + dpt.repeat(x, 2.0) + + # repeats tuple must be the same length as axis + with pytest.raises(ValueError): + dpt.repeat(x, (1, 2)) + + # repeats tuple elements must be positive + with pytest.raises(ValueError): + dpt.repeat(x, (-1,)) + + # repeats must be int or tuple + with pytest.raises(TypeError): + dpt.repeat(x, dict()) + + # repeats array must be 0d or 1d + with pytest.raises(ValueError): + dpt.repeat(x, dpt.ones((1, 1), dtype="i8")) + + # repeats must be castable to i8 + with pytest.raises(TypeError): + dpt.repeat(x, dpt.asarray(2.0, dtype="f4")) + + # compute follows data + q2 = dpctl.SyclQueue() + reps = dpt.asarray(1, dtype="i8", sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.repeat(x, reps) + + # repeats array must not contain negative elements + reps = dpt.asarray(-1, dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + reps = dpt.asarray([1, 1, 1, 1, -1], dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + + # repeats must broadcastable to axis size + reps = dpt.arange(10, dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + + +def test_tile_basic(): + get_queue_or_skip() + + reps = 2 + x = dpt.arange(5, dtype="i4") + res = dpt.tile(x, reps) + assert res.shape == (x.shape[0] * reps,) + assert dpt.all(res[: x.size] == res[x.size :]) + + reps = (2, 1) + expected_sh = (2, x.shape[0]) + expected_res = dpt.broadcast_to(x, expected_sh) + res = dpt.tile(x, reps) + assert res.shape == expected_sh + assert dpt.all(expected_res == res) + + +def test_tile_size_1(): + get_queue_or_skip() + + reps = 5 + # test for 0d array + x1 = dpt.asarray(2, dtype="i4") + res = dpt.tile(x1, reps) + assert dpt.all(res == dpt.full(reps, 2, dtype="i4")) + + # test for 1d array with single element + x2 = dpt.asarray([2], dtype="i4") + res = dpt.tile(x2, reps) + assert dpt.all(res == dpt.full(reps, 2, dtype="i4")) + + reps = () + # test for gh-1627 behavior + res = dpt.tile(x1, reps) + assert x1.shape == res.shape + assert_array_equal(dpt.asnumpy(x1), dpt.asnumpy(res)) + + res = dpt.tile(x2, reps) + assert x2.shape == res.shape + assert_array_equal(dpt.asnumpy(x2), dpt.asnumpy(res)) + + +def test_tile_prepends_axes(): + get_queue_or_skip() + + reps = (2,) + x = dpt.ones((5, 10), dtype="i4") + expected_res = dpt.ones((5, 20), dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == expected_res) + + reps = (3, 2, 2) + expected_res = 
dpt.ones((3, 10, 20), dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == expected_res) + + +def test_tile_empty_outputs(): + get_queue_or_skip() + + x = dpt.asarray((), dtype="i4") + reps = 10 + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (0,) + + x = dpt.ones((3, 0, 5), dtype="i4") + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (3, 0, 50) + + reps = (2, 1, 2) + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (6, 0, 10) + + x = dpt.ones((2, 3, 4), dtype="i4") + reps = (0, 1, 1) + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (0, 3, 4) + + +def test_tile_strides(): + get_queue_or_skip() + + reps = (1, 2) + x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10)) + x1 = x[:, ::-2] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[:, : x1.shape[1]], expected_res[:, x1.shape[1] :] = x1, x1 + res = dpt.tile(x1, reps) + assert dpt.all(res == expected_res) + + reps = (2, 1) + x1 = x[::-2, :] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[: x1.shape[0], :], expected_res[x1.shape[0] :, :] = x1, x1 + res = dpt.tile(x1, reps) + assert dpt.all(res == expected_res) + + +def test_tile_size_1_axes(): + get_queue_or_skip() + + reps = (1, 2, 1) + x = dpt.ones((2, 1, 3), dtype="i4") + res = dpt.tile(x, reps) + expected_res = dpt.broadcast_to(x, (2, 2, 3)) + assert dpt.all(res == expected_res) + + +def test_tile_arg_validation(): + get_queue_or_skip() + + with pytest.raises(TypeError): + dpt.tile(dict(), 2) + + # repetitions must be int or tuple + x = dpt.empty(()) + with pytest.raises(TypeError): + dpt.tile(x, dict()) + + +def test_repeat_0_size(): + get_queue_or_skip() + + x = dpt.ones((0, 10, 0), dtype="i4") + repetitions = 2 + res = dpt.repeat(x, repetitions) + assert res.shape == (0,) + res = dpt.repeat(x, repetitions, axis=2) + assert res.shape == x.shape + res = dpt.repeat(x, repetitions, axis=1) + axis_sz = x.shape[1] * repetitions + assert res.shape == (0, 20, 0) + + repetitions = dpt.asarray(2, dtype="i4") + res = dpt.repeat(x, repetitions) + assert res.shape == (0,) + res = dpt.repeat(x, repetitions, axis=2) + assert res.shape == x.shape + res = dpt.repeat(x, repetitions, axis=1) + assert res.shape == (0, 20, 0) + + repetitions = dpt.arange(10, dtype="i4") + res = dpt.repeat(x, repetitions, axis=1) + axis_sz = dpt.sum(repetitions) + assert res.shape == (0, axis_sz, 0) + + repetitions = (2,) * 10 + res = dpt.repeat(x, repetitions, axis=1) + axis_sz = 2 * x.shape[1] + assert res.shape == (0, axis_sz, 0) + + +def test_result_type_bug_1874(): + py_sc = True + np_sc = np.asarray([py_sc])[0] + dts_bool = [py_sc, np_sc] + py_sc = int(1) + np_sc = np.asarray([py_sc])[0] + dts_ints = [py_sc, np_sc] + dts_floats = [float(1), np.float64(1)] + dts_complexes = [complex(1), np.complex128(1)] + + # iterate over two categories + for dts1, dts2 in itertools.product( + [dts_bool, dts_ints, dts_floats, dts_complexes], repeat=2 + ): + res_dts = [] + # iterate over Python scalar/NumPy scalar choices within categories + for dt1, dt2 in itertools.product(dts1, dts2): + res_dt = dpt.result_type(dt1, dt2) + res_dts.append(res_dt) + # check that all results are the same + assert res_dts and all(res_dts[0] == el for el in res_dts[1:]) diff --git a/dpnp/tests/tensor/test_usm_ndarray_operators.py b/dpnp/tests/tensor/test_usm_ndarray_operators.py new file mode 100644 index 000000000000..8ac178def197 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_operators.py @@ -0,0 +1,154 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import pytest + +import dpnp.tensor as dpt + + +class Dummy: + @staticmethod + def abs(a): + return a + + @staticmethod + def add(a, b): + if isinstance(a, dpt.usm_ndarray): + return a + else: + return b + + @staticmethod + def subtract(a, b): + if isinstance(a, dpt.usm_ndarray): + return a + else: + return b + + @staticmethod + def multiply(a, b): + if isinstance(a, dpt.usm_ndarray): + return a + else: + return b + + +@pytest.mark.parametrize("namespace", [dpt, Dummy()]) +def test_fp_ops(namespace): + try: + X = dpt.ones(1) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X._set_namespace(namespace) + assert X.__array_namespace__() is namespace + X[0] = -2.5 + X.__abs__() + X.__add__(1.0) + X.__radd__(1.0) + X.__sub__(1.0) + X.__rsub__(1.0) + X.__mul__(1.0) + X.__rmul__(1.0) + X.__truediv__(1.0) + X.__rtruediv__(1.0) + X.__floordiv__(1.0) + X.__rfloordiv__(1.0) + X.__pos__() + X.__neg__() + X.__eq__(-2.5) + X.__ne__(-2.5) + X.__le__(-2.5) + X.__ge__(-2.5) + X.__gt__(-2.0) + X.__iadd__(X) + X.__isub__(X) + X.__imul__(X) + X.__itruediv__(1.0) + X.__ifloordiv__(1.0) + + +@pytest.mark.parametrize("namespace", [dpt, Dummy()]) +def test_int_ops(namespace): + try: + X = dpt.usm_ndarray(1, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X._set_namespace(namespace) + assert X.__array_namespace__() is namespace + X.__lshift__(2) + X.__rshift__(2) + X.__rlshift__(2) + X.__rrshift__(2) + X.__ilshift__(2) + X.__irshift__(2) + X.__and__(X) + X.__rand__(X) + X.__iand__(X) + X.__or__(X) + X.__ror__(X) + X.__ior__(X) + X.__xor__(X) + X.__rxor__(X) + X.__ixor__(X) + X.__invert__() + X.__mod__(5) + X.__rmod__(5) + X.__imod__(5) + X.__pow__(2) + X.__rpow__(2) + X.__ipow__(2) + + +@pytest.mark.parametrize("namespace", [dpt, Dummy()]) 
+def test_mat_ops(namespace): + try: + M = dpt.eye(3, 3) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + M._set_namespace(namespace) + assert M.__array_namespace__() is namespace + M.__matmul__(M) + M.__imatmul__(M) + M.__rmatmul__(M) + + +@pytest.mark.parametrize("namespace", [dpt, Dummy()]) +def test_comp_ops(namespace): + try: + X = dpt.asarray(1, dtype="u8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X._set_namespace(namespace) + assert X.__array_namespace__() is namespace + assert X.__gt__(-1) + assert X.__ge__(-1) + assert not X.__lt__(-1) + assert not X.__le__(-1) + assert not X.__eq__(-1) + assert X.__ne__(-1) diff --git a/dpnp/tests/tensor/test_usm_ndarray_print.py b/dpnp/tests/tensor/test_usm_ndarray_print.py new file mode 100644 index 000000000000..94dbfca7c198 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_print.py @@ -0,0 +1,408 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + + +class TestPrint: + def setup_method(self): + self._retain_options = dpt.get_print_options() + + def teardown_method(self): + dpt.set_print_options(**self._retain_options) + + +class TestArgValidation(TestPrint): + @pytest.mark.parametrize( + "arg,err", + [ + ({"linewidth": "I"}, TypeError), + ({"edgeitems": "I"}, TypeError), + ({"threshold": "I"}, TypeError), + ({"precision": "I"}, TypeError), + ({"floatmode": "I"}, ValueError), + ({"edgeitems": "I"}, TypeError), + ({"sign": "I"}, ValueError), + ({"nanstr": np.nan}, TypeError), + ({"infstr": np.nan}, TypeError), + ], + ) + def test_print_option_arg_validation(self, arg, err): + with pytest.raises(err): + dpt.set_print_options(**arg) + + def test_usm_ndarray_repr_arg_validation(self): + X = {} + with pytest.raises(TypeError): + dpt.usm_ndarray_repr(X) + + try: + X = dpt.arange(4) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(TypeError): + dpt.usm_ndarray_repr(X, line_width="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_repr(X, precision="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_repr(X, prefix=4) + + def test_usm_ndarray_str_arg_validation(self): + X = {} + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X) + + try: + X = dpt.arange(4) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, line_width="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, edge_items="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, threshold="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, precision="I") + + with pytest.raises(ValueError): + dpt.usm_ndarray_str(X, floatmode="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, edge_items="I") + + with pytest.raises(ValueError): + dpt.usm_ndarray_str(X, sign="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, prefix=4) + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, prefix=4) + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, suffix=4) + + +class TestSetPrintOptions(TestPrint): + def test_set_linewidth(self): + q = get_queue_or_skip() + + dpt.set_print_options(linewidth=1) + x = dpt.asarray([0, 1], sycl_queue=q) + assert str(x) == "[0\n 1]" + + def test_set_precision(self): + q = get_queue_or_skip() + + dpt.set_print_options(precision=4) + x = dpt.asarray([1.23450], sycl_queue=q) + assert str(x) == "[1.2345]" + + def test_threshold_edgeitems(self): + q = get_queue_or_skip() + + dpt.set_print_options(threshold=1, edgeitems=1) + x = dpt.arange(9, sycl_queue=q) + assert str(x) == "[0 ... 
8]" + dpt.set_print_options(edgeitems=9) + assert str(x) == "[0 1 2 3 4 5 6 7 8]" + + def test_floatmodes(self): + q = get_queue_or_skip() + + x = dpt.asarray([0.1234, 0.1234678], sycl_queue=q) + dpt.set_print_options(floatmode="fixed", precision=4) + assert str(x) == "[0.1234 0.1235]" + + dpt.set_print_options(floatmode="unique") + assert str(x) == "[0.1234 0.1234678]" + + dpt.set_print_options(floatmode="maxprec") + assert str(x) == "[0.1234 0.1235]" + + dpt.set_print_options(floatmode="maxprec", precision=8) + assert str(x) == "[0.1234 0.1234678]" + + dpt.set_print_options(floatmode="maxprec_equal", precision=4) + assert str(x) == "[0.1234 0.1235]" + + dpt.set_print_options(floatmode="maxprec_equal", precision=8) + assert str(x) == "[0.1234000 0.1234678]" + + def test_nan_inf_suppress(self): + q = get_queue_or_skip() + + dpt.set_print_options(nanstr="nan1", infstr="inf1") + x = dpt.asarray([np.nan, np.inf], sycl_queue=q) + assert str(x) == "[nan1 inf1]" + + def test_suppress_small(self): + q = get_queue_or_skip() + + dpt.set_print_options(suppress=True) + x = dpt.asarray(5e-10, sycl_queue=q) + assert str(x) == "0." + + def test_sign(self): + q = get_queue_or_skip() + + x = dpt.asarray([0.0, 1.0, 2.0], sycl_queue=q) + y = dpt.asarray(1.0, sycl_queue=q) + z = dpt.asarray([1.0 + 1.0j], sycl_queue=q) + assert str(x) == "[0. 1. 2.]" + assert str(y) == "1." + assert str(z) == "[1.+1.j]" + + dpt.set_print_options(sign="+") + assert str(x) == "[+0. +1. +2.]" + assert str(y) == "+1." + assert str(z) == "[+1.+1.j]" + + dpt.set_print_options(sign=" ") + assert str(x) == "[ 0. 1. 2.]" + assert str(y) == " 1." + assert str(z) == "[ 1.+1.j]" + + def test_numpy(self): + dpt.set_print_options(numpy=True) + options = dpt.get_print_options() + np_options = np.get_printoptions() + assert all(np_options[k] == options[k] for k in options.keys()) + + +class TestPrintFns(TestPrint): + @pytest.mark.parametrize( + "dtype,x_str", + [ + ("b1", "[False True True True]"), + ("i1", "[0 1 2 3]"), + ("u1", "[0 1 2 3]"), + ("i2", "[0 1 2 3]"), + ("u2", "[0 1 2 3]"), + ("i4", "[0 1 2 3]"), + ("u4", "[0 1 2 3]"), + ("i8", "[0 1 2 3]"), + ("u8", "[0 1 2 3]"), + ("f2", "[0. 1. 2. 3.]"), + ("f4", "[0. 1. 2. 3.]"), + ("f8", "[0. 1. 2. 3.]"), + ("c8", "[0.+0.j 1.+0.j 2.+0.j 3.+0.j]"), + ("c16", "[0.+0.j 1.+0.j 2.+0.j 3.+0.j]"), + ], + ) + def test_print_types(self, dtype, x_str): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray([0, 1, 2, 3], dtype=dtype, sycl_queue=q) + assert str(x) == x_str + + def test_print_str(self): + q = get_queue_or_skip() + + x = dpt.asarray(0, sycl_queue=q) + assert str(x) == "0" + + x = dpt.asarray([np.nan, np.inf], sycl_queue=q) + assert str(x) == "[nan inf]" + + x = dpt.arange(9, sycl_queue=q) + assert str(x) == "[0 1 2 3 4 5 6 7 8]" + + y = dpt.reshape(x, (3, 3), copy=True) + assert str(y) == "[[0 1 2]\n [3 4 5]\n [6 7 8]]" + + def test_print_str_abbreviated(self): + q = get_queue_or_skip() + + dpt.set_print_options(threshold=0, edgeitems=1) + x = dpt.arange(9, sycl_queue=q) + assert str(x) == "[0 ... 8]" + + x = dpt.reshape(x, (3, 3)) + assert str(x) == "[[0 ... 2]\n ...\n [6 ... 
8]]" + + def test_usm_ndarray_str_separator(self): + q = get_queue_or_skip() + + x = dpt.reshape(dpt.arange(4, sycl_queue=q), (2, 2)) + + np.testing.assert_equal( + dpt.usm_ndarray_str(x, prefix="test", separator=" "), + "[[0 1]\n [2 3]]", + ) + + def test_print_repr(self): + q = get_queue_or_skip() + + x = dpt.asarray(3, dtype="int64", sycl_queue=q) + assert repr(x) == "usm_ndarray(3)" + + x = dpt.asarray([np.nan, np.inf], sycl_queue=q) + if x.sycl_device.has_aspect_fp64: + assert repr(x) == "usm_ndarray([nan, inf])" + else: + assert repr(x) == "usm_ndarray([nan, inf], dtype=float32)" + + x = dpt.arange(9, sycl_queue=q, dtype="int64") + assert repr(x) == "usm_ndarray([0, 1, 2, 3, 4, 5, 6, 7, 8])" + + x = dpt.reshape(x, (3, 3)) + np.testing.assert_equal( + repr(x), + "usm_ndarray([[0, 1, 2]," + "\n [3, 4, 5]," + "\n [6, 7, 8]])", + ) + + x = dpt.arange(4, dtype="i4", sycl_queue=q) + assert repr(x) == "usm_ndarray([0, 1, 2, 3], dtype=int32)" + + dpt.set_print_options(linewidth=1) + np.testing.assert_equal( + repr(x), + "usm_ndarray([0," + "\n 1," + "\n 2," + "\n 3]," + "\n dtype=int32)", + ) + + # zero-size array + dpt.set_print_options(linewidth=75) + x = dpt.ones((9, 0), dtype="i4", sycl_queue=q) + assert repr(x) == "usm_ndarray([], shape=(9, 0), dtype=int32)" + + def test_print_repr_abbreviated(self): + q = get_queue_or_skip() + + dpt.set_print_options(threshold=0, edgeitems=1) + x = dpt.arange(9, dtype="int64", sycl_queue=q) + assert repr(x) == "usm_ndarray([0, ..., 8], shape=(9,))" + + y = dpt.asarray(x, dtype="i4", copy=True) + assert repr(y) == "usm_ndarray([0, ..., 8], shape=(9,), dtype=int32)" + + x = dpt.reshape(x, (3, 3)) + np.testing.assert_equal( + repr(x), + "usm_ndarray([[0, ..., 2]," + "\n ...," + "\n [6, ..., 8]], shape=(3, 3))", + ) + + y = dpt.reshape(y, (3, 3)) + np.testing.assert_equal( + repr(y), + "usm_ndarray([[0, ..., 2]," + "\n ...," + "\n [6, ..., 8]], shape=(3, 3), dtype=int32)", + ) + + dpt.set_print_options(linewidth=1) + np.testing.assert_equal( + repr(y), + "usm_ndarray([[0," + "\n ...," + "\n 2]," + "\n ...," + "\n [6," + "\n ...," + "\n 8]]," + "\n shape=(3, 3)," + "\n dtype=int32)", + ) + + @pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "u8", + "f2", + "f4", + "c8", + ], + ) + def test_repr_appended_dtype(self, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.empty(4, dtype=dtype) + assert repr(x).split("=")[-1][:-1] == x.dtype.name + + def test_usm_ndarray_repr_prefix(self): + q = get_queue_or_skip() + + x = dpt.arange(4, dtype=np.intp, sycl_queue=q) + np.testing.assert_equal( + dpt.usm_ndarray_repr(x, prefix="test"), "test([0, 1, 2, 3])" + ) + x = dpt.reshape(x, (2, 2)) + np.testing.assert_equal( + dpt.usm_ndarray_repr(x, prefix="test"), + "test([[0, 1]," "\n [2, 3]])", + ) + + +class TestContextManager: + def test_context_manager_basic(self): + options = dpt.get_print_options() + try: + X = dpt.asarray(1.234567) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with dpt.print_options(precision=4): + s = str(X) + assert s == "1.2346" + assert options == dpt.get_print_options() + + def test_context_manager_as(self): + with dpt.print_options(precision=4) as x: + options = x.copy() + assert options["precision"] == 4 diff --git a/dpnp/tests/tensor/test_usm_ndarray_reductions.py b/dpnp/tests/tensor/test_usm_ndarray_reductions.py new file mode 100644 index 000000000000..2c431efa936d --- /dev/null +++ 
b/dpnp/tests/tensor/test_usm_ndarray_reductions.py @@ -0,0 +1,704 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from random import randrange + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt +from dpnp.tensor._tensor_impl import default_device_index_type + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_no_complex_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", +] + +_all_dtypes = _no_complex_dtypes + [ + "c8", + "c16", +] + + +def test_max_min_axis(): + get_queue_or_skip() + + x = dpt.reshape( + dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7) + ) + + m = dpt.max(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, -1, -1, :, -1]) + + m = dpt.min(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, 0, 0, :, 0]) + + +def test_max_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5), (3, 4, 5)) + + m = dpt.max(x, axis=0) + assert dpt.all(m == x[-1, :, :]) + + x = dpt.flip(x, axis=2) + m = dpt.max(x, axis=2) + assert dpt.all(m == x[:, :, 0]) + + +def test_reduction_keepdims(): + get_queue_or_skip() + + n0, n1 = 3, 6 + x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4") + m = dpt.max(x, axis=(1, 2, -1), keepdims=True) + + xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1)) + p = dpt.argmax(xx, axis=-1, keepdims=True) + + assert m.shape == (n0, 1, 1, n1, 1) + assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + assert dpt.all(p == 0) + + +def test_max_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.max(x) + + assert m.shape == () + assert x == m + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_reduction_kernels(arg_dtype): + # i4 - always uses 
atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 3 + x[:, x.shape[1] // 2] = 3 + + m = dpt.max(x) + assert m == 3 + m = dpt.max(x, axis=0) + assert dpt.all(m == 3) + m = dpt.max(x, axis=1) + assert dpt.all(m == 3) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 0 + x[:, x.shape[1] // 2] = 0 + + m = dpt.min(x) + assert m == 0 + m = dpt.min(x, axis=0) + assert dpt.all(m == 0) + m = dpt.min(x, axis=1) + assert dpt.all(m == 0) + + +def test_max_min_nan_propagation(): + get_queue_or_skip() + + # float, finites + x = dpt.arange(4, dtype="f4") + x[0] = dpt.nan + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + # float, infinities + x[1:] = dpt.inf + assert dpt.isnan(dpt.max(x)) + x[1:] = -dpt.inf + assert dpt.isnan(dpt.min(x)) + + # complex + x = dpt.arange(4, dtype="c8") + x[0] = complex(dpt.nan, 0) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + x[0] = complex(0, dpt.nan) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + +def test_argmax_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.argmax(x) + + assert m.shape == () + assert m == 0 + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_search_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x_shape = (24, 1024) + x_size = np.prod(x_shape) + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, x_shape) + x[idx] = 2 + + m = dpt.argmax(x) + assert m == idx + + # test case of strided input mapping to contig + # implementation + m = dpt.argmax(dpt.flip(x)) + assert m == x.size - 1 - idx + + # test case of strided implementation + y = dpt.ones(2 * x.size, dtype=arg_dtype, sycl_queue=q) + y[::2] = x + m = dpt.argmax(y) + assert m == 2 * idx + + x = dpt.reshape(x, x_shape) + + x[idx_tup[0], :] = 3 + m = dpt.argmax(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = 4 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = 5 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx) + + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, x_shape) + x[idx] = 0 + + m = dpt.argmin(x) + assert m == idx + + x = dpt.reshape(x, x_shape) + + x[idx_tup[0], :] = -1 + m = dpt.argmin(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = -2 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = -3 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx) + + +def test_argmax_argmin_nan_propagation(): + get_queue_or_skip() + + sz = 4 + idx = randrange(sz) + # floats + x = dpt.arange(sz, dtype="f4") + x[idx] = dpt.nan + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + # complex + x = dpt.arange(sz, dtype="c8") + x[idx] = complex(dpt.nan, 0) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + x[idx] = complex(0, dpt.nan) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + +def 
test_argmax_argmin_identities(): + # make sure that identity arrays work as expected + get_queue_or_skip() + + x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4") + assert dpt.argmax(x) == 0 + x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") + assert dpt.argmin(x) == 0 + + +@pytest.mark.parametrize("order", ["C", "F"]) +def test_argmax_axis0_axis1(order): + get_queue_or_skip() + + x = dpt.asarray([[1, 2, 3], [6, 5, 4]], dtype="i4", order=order) + assert dpt.argmax(x) == 3 + + res = dpt.argmax(x, axis=0) + expected = dpt.asarray([1, 1, 1], dtype=res.dtype) + assert dpt.all(res == expected) + + res = dpt.argmax(x, axis=1) + expected = dpt.asarray([2, 0], dtype=res.dtype) + assert dpt.all(res == expected) + + +def test_reduction_arg_validation(): + get_queue_or_skip() + + x = {} + with pytest.raises(TypeError): + dpt.sum(x) + with pytest.raises(TypeError): + dpt.max(x) + with pytest.raises(TypeError): + dpt.argmax(x) + + x = dpt.zeros((0,), dtype="i4") + with pytest.raises(ValueError): + dpt.max(x) + with pytest.raises(ValueError): + dpt.argmax(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_logsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.logaddexp.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_logsumexp_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.logsumexp(x) + assert y.shape == () + assert y == -dpt.inf + + +def test_logsumexp_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + s = dpt.logsumexp(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + tol = dpt.finfo(s.dtype).resolution + assert_allclose( + dpt.asnumpy(s), + np.logaddexp.reduce(dpt.asnumpy(m), axis=(1, 2, -1), dtype=s.dtype), + rtol=tol, + atol=tol, + ) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_logsumexp_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_logsumexp_keepdims(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.logsumexp(m, axis=(1, 2, -1), keepdims=True) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + + +def test_logsumexp_keepdims_zero_size(): + get_queue_or_skip() + n = 10 + a = dpt.ones((n, 0, n)) + + s1 = dpt.logsumexp(a, keepdims=True) + assert s1.shape == (1, 1, 1) + + s2 = dpt.logsumexp(a, axis=(0, 1), keepdims=True) + assert s2.shape == (1, 1, n) + + s3 = dpt.logsumexp(a, axis=(1, 2), keepdims=True) + assert s3.shape == (n, 1, 1) + + s4 = dpt.logsumexp(a, axis=(0, 2), keepdims=True) + assert s4.shape == (1, 0, 1) + + a0 = a[0] + s5 = dpt.logsumexp(a0, keepdims=True) + assert s5.shape == (1, 1) + + +def test_logsumexp_scalar(): + get_queue_or_skip() + + m = dpt.ones(()) + s = dpt.logsumexp(m) + + assert isinstance(s, dpt.usm_ndarray) + assert m.sycl_queue == s.sycl_queue + assert s.shape == () + + +def 
test_logsumexp_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(ValueError): + dpt.logsumexp(x) + + +def test_logsumexp_int_axis(): + get_queue_or_skip() + + x = dpt.zeros((8, 10), dtype="f4") + res = dpt.logsumexp(x, axis=0) + assert res.ndim == 1 + assert res.shape[0] == 10 + + +def test_logsumexp_invalid_arr(): + x = {} + with pytest.raises(TypeError): + dpt.logsumexp(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_hypot_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.hypot.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_hypot_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.reduce_hypot(x) + assert y.shape == () + assert y == 0 + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_hypot_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_hypot_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(ValueError): + dpt.reduce_hypot(x) + + +def test_tree_reduction_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.logsumexp(x, axis=0) + tol = dpt.finfo(m.dtype).resolution + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=0, dtype=m.dtype), + rtol=tol, + atol=tol, + ) + + x = dpt.flip(x, axis=2) + m = dpt.logsumexp(x, axis=2) + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=2, dtype=m.dtype), + rtol=tol, + atol=tol, + ) + + +def test_numeric_reduction_out_kwarg(): + get_queue_or_skip() + + n1, n2, n3 = 3, 4, 5 + x = dpt.ones((n1, n2, n3), dtype="i8") + out = dpt.zeros((2 * n1, 3 * n2), dtype="i8") + res = dpt.sum(x, axis=-1, out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == 5) + + out = dpt.zeros((2 * n1, 3 * n2, 1), dtype="i8") + res = dpt.sum(x, axis=-1, keepdims=True, out=out[::-2, 1::3]) + assert res.shape == (n1, n2, 1) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == 5) + + res = dpt.sum(x, axis=0, out=x[-1]) + assert dpt.all(x[-1] == res) + assert dpt.all(x[-1] == 3) + assert dpt.all(x[0:-1] == 1) + + # test no-op case + x = dpt.ones((n1, n2, n3), dtype="i8") + out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i8") + res = dpt.sum(x, axis=(), out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == x) + + # test with dtype kwarg + x = dpt.ones((n1, n2, n3), dtype="i4") + out = dpt.zeros((2 * n1, 3 * n2), dtype="f4") + res = dpt.sum(x, axis=-1, dtype="f4", out=out[::-2, 1::3]) + zero_res = dpt.zeros_like(res) + assert dpt.allclose(out[::-2, 
0::3], zero_res) + assert dpt.allclose(out[::-2, 2::3], zero_res) + assert dpt.allclose(out[::-2, 1::3], res) + assert dpt.allclose(out[::-2, 1::3], dpt.full_like(res, 5, dtype="f4")) + + +def test_comparison_reduction_out_kwarg(): + get_queue_or_skip() + + n1, n2, n3 = 3, 4, 5 + x = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype="i4"), (n1, n2, n3)) + out = dpt.zeros((2 * n1, 3 * n2), dtype="i4") + res = dpt.max(x, axis=-1, out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == x[:, :, -1]) + + out = dpt.zeros((2 * n1, 3 * n2, 1), dtype="i4") + res = dpt.max(x, axis=-1, keepdims=True, out=out[::-2, 1::3]) + assert res.shape == (n1, n2, 1) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == x[:, :, -1, dpt.newaxis]) + + # test no-op case + out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i4") + res = dpt.max(x, axis=(), out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == x) + + # test overlap + res = dpt.max(x, axis=0, out=x[0]) + assert dpt.all(x[0] == res) + assert dpt.all(x[0] == x[-1]) + + +def test_search_reduction_out_kwarg(): + get_queue_or_skip() + + n1, n2, n3 = 3, 4, 5 + dt = dpt.__array_namespace_info__().default_dtypes()["indexing"] + + x = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype=dt), (n1, n2, n3)) + out = dpt.zeros((2 * n1, 3 * n2), dtype=dt) + res = dpt.argmax(x, axis=-1, out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == n2) + + out = dpt.zeros((2 * n1, 3 * n2, 1), dtype=dt) + res = dpt.argmax(x, axis=-1, keepdims=True, out=out[::-2, 1::3]) + assert res.shape == (n1, n2, 1) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == n3 - 1) + + # test no-op case + x = dpt.ones((), dtype=dt) + out = dpt.ones(2, dtype=dt) + res = dpt.argmax(x, axis=None, out=out[1]) + assert dpt.all(out[0] == 1) + assert dpt.all(out[1] == 0) + + # test overlap + x = dpt.reshape(dpt.arange(n1 * n2, dtype=dt), (n1, n2)) + res = dpt.argmax(x, axis=0, out=x[0]) + assert dpt.all(x[0] == res) + assert dpt.all(x[0] == n1 - 1) + + +def test_reduction_out_kwarg_arg_validation(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + ind_dt = dpt.__array_namespace_info__().default_dtypes()["indexing"] + + x = dpt.ones(10, dtype="f4") + out_wrong_queue = dpt.empty((), dtype="f4", sycl_queue=q2) + out_wrong_dtype = dpt.empty((), dtype="i4", sycl_queue=q1) + out_wrong_shape = dpt.empty(1, dtype="f4", sycl_queue=q1) + out_wrong_keepdims = dpt.empty((), dtype="f4", sycl_queue=q1) + out_not_writable = dpt.empty((), dtype="f4", sycl_queue=q1) + out_not_writable.flags["W"] = False + + with pytest.raises(TypeError): + dpt.sum(x, out=dict()) + with pytest.raises(TypeError): + dpt.max(x, out=dict()) + with pytest.raises(TypeError): + dpt.argmax(x, out=dict()) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.sum(x, out=out_wrong_queue) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.max(x, out=out_wrong_queue) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.argmax(x, out=dpt.empty_like(out_wrong_queue, dtype=ind_dt)) + with pytest.raises(ValueError): + dpt.sum(x, 
out=out_wrong_dtype) + with pytest.raises(ValueError): + dpt.max(x, out=out_wrong_dtype) + with pytest.raises(ValueError): + dpt.argmax(x, out=dpt.empty_like(out_wrong_dtype, dtype="f4")) + with pytest.raises(ValueError): + dpt.sum(x, out=out_wrong_shape) + with pytest.raises(ValueError): + dpt.max(x, out=out_wrong_shape) + with pytest.raises(ValueError): + dpt.argmax(x, out=dpt.empty_like(out_wrong_shape, dtype=ind_dt)) + with pytest.raises(ValueError): + dpt.sum(x, out=out_not_writable) + with pytest.raises(ValueError): + dpt.max(x, out=out_not_writable) + with pytest.raises(ValueError): + search_not_writable = dpt.empty_like(out_not_writable, dtype=ind_dt) + search_not_writable.flags["W"] = False + dpt.argmax(x, out=search_not_writable) + with pytest.raises(ValueError): + dpt.sum(x, keepdims=True, out=out_wrong_keepdims) + with pytest.raises(ValueError): + dpt.max(x, keepdims=True, out=out_wrong_keepdims) + with pytest.raises(ValueError): + dpt.argmax( + x, + keepdims=True, + out=dpt.empty_like(out_wrong_keepdims, dtype=ind_dt), + ) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_count_nonzero(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + expected_dt = default_device_index_type(q.sycl_device) + + x = dpt.ones(10, dtype=dt, sycl_queue=q) + res = dpt.count_nonzero(x) + assert res == 10 + assert res.dtype == expected_dt + + x[3:6] = 0 + res = dpt.count_nonzero(x) + assert res == 7 + assert res.dtype == expected_dt diff --git a/dpnp/tests/tensor/test_usm_ndarray_search_functions.py b/dpnp/tests/tensor/test_usm_ndarray_search_functions.py new file mode 100644 index 000000000000..33942d93c3a7 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_search_functions.py @@ -0,0 +1,593 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import dpnp.tensor as dpt +from dpnp.tensor._search_functions import _where_result_type +from dpnp.tensor._type_utils import _all_data_types + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + + +class mock_device: + def __init__(self, fp16, fp64): + self.has_aspect_fp16 = fp16 + self.has_aspect_fp64 = fp64 + + +def test_where_basic(): + get_queue_or_skip() + + cond = dpt.asarray( + [ + [True, False, False], + [False, True, False], + [False, False, True], + [False, False, False], + [True, True, True], + ] + ) + out = dpt.where(cond, dpt.asarray(1), dpt.asarray(0)) + out_expected = dpt.asarray( + [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 1]] + ) + assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all() + + out = dpt.where(cond, dpt.ones(cond.shape), dpt.zeros(cond.shape)) + assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all() + + out = dpt.where( + cond, + dpt.ones(cond.shape[0], dtype="i4")[:, dpt.newaxis], + dpt.zeros(cond.shape[0], dtype="i4")[:, dpt.newaxis], + ) + assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all() + + +def _dtype_all_close(x1, x2): + if np.issubdtype(x2.dtype, np.floating) or np.issubdtype( + x2.dtype, np.complexfloating + ): + x2_dtype = x2.dtype + return np.allclose( + x1, x2, atol=np.finfo(x2_dtype).eps, rtol=np.finfo(x2_dtype).eps + ) + else: + return np.allclose(x1, x2) + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +@pytest.mark.parametrize("fp16", [True, False]) +@pytest.mark.parametrize("fp64", [True, False]) +def test_where_result_types(dt1, dt2, fp16, fp64): + dev = mock_device(fp16, fp64) + + dt1 = dpt.dtype(dt1) + dt2 = dpt.dtype(dt2) + res_t = _where_result_type(dt1, dt2, dev) + + if fp16 and fp64: + assert res_t == dpt.result_type(dt1, dt2) + else: + if res_t: + assert res_t.kind == dpt.result_type(dt1, dt2).kind + else: + # some illegal cases are covered above, but + # this guarantees that _where_result_type + # produces None only when one of the dtypes + # is illegal given fp aspects of device + all_dts = _all_data_types(fp16, fp64) + assert dt1 not in all_dts or dt2 not in all_dts + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_where_mask_dtypes(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + # mask dtype changes + cond = dpt.asarray([0, 1, 3, 0, 10], dtype=dt, sycl_queue=q) + x1 = dpt.asarray(0, dtype="f4", sycl_queue=q) + x2 = dpt.asarray(1, dtype="f4", sycl_queue=q) + res = dpt.where(cond, x1, x2) + + res_check = np.asarray([1, 0, 0, 1, 0], dtype=res.dtype) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + # contiguous cases + x1 = dpt.full(cond.shape, 0, dtype="f4", sycl_queue=q) + x2 = dpt.full(cond.shape, 1, dtype="f4", sycl_queue=q) + res = dpt.where(cond, x1, x2) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + # input array dtype changes + cond = dpt.asarray([False, True, True, False, True], sycl_queue=q) + x1 = dpt.asarray(0, dtype=dt, sycl_queue=q) + x2 = dpt.asarray(1, dtype=dt, sycl_queue=q) + res = dpt.where(cond, x1, x2) + + res_check = np.asarray([1, 0, 0, 1, 0], dtype=res.dtype) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + # contiguous 
cases + x1 = dpt.full(cond.shape, 0, dtype=dt, sycl_queue=q) + x2 = dpt.full(cond.shape, 1, dtype=dt, sycl_queue=q) + res = dpt.where(cond, x1, x2) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + +def test_where_asymmetric_dtypes(): + q = get_queue_or_skip() + + cond = dpt.asarray([0, 1, 3, 0, 10], dtype="?", sycl_queue=q) + x1 = dpt.asarray(2, dtype="i4", sycl_queue=q) + x2 = dpt.asarray(3, dtype="i8", sycl_queue=q) + + res = dpt.where(cond, x1, x2) + res_check = np.asarray([3, 2, 2, 3, 2], dtype=res.dtype) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + # flip order + + res = dpt.where(cond, x2, x1) + res_check = np.asarray([2, 3, 3, 2, 3], dtype=res.dtype) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + +def test_where_nan_inf(): + get_queue_or_skip() + + cond = dpt.asarray([True, False, True, False], dtype="?") + x1 = dpt.asarray([np.nan, 2.0, np.inf, 3.0], dtype="f4") + x2 = dpt.asarray([2.0, np.nan, 3.0, np.inf], dtype="f4") + + cond_np = dpt.asnumpy(cond) + x1_np = dpt.asnumpy(x1) + x2_np = dpt.asnumpy(x2) + + res = dpt.where(cond, x1, x2) + res_np = np.where(cond_np, x1_np, x2_np) + + assert np.allclose(dpt.asnumpy(res), res_np, equal_nan=True) + + res = dpt.where(x1, cond, x2) + res_np = np.where(x1_np, cond_np, x2_np) + assert _dtype_all_close(dpt.asnumpy(res), res_np) + + +def test_where_empty(): + # check that numpy returns same results when + # handling empty arrays + get_queue_or_skip() + + empty = dpt.empty(0, dtype="i2") + m = dpt.asarray(True) + x1 = dpt.asarray(1, dtype="i2") + x2 = dpt.asarray(2, dtype="i2") + res = dpt.where(empty, x1, x2) + + empty_np = np.empty(0, dtype="i2") + m_np = dpt.asnumpy(m) + x1_np = dpt.asnumpy(x1) + x2_np = dpt.asnumpy(x2) + res_np = np.where(empty_np, x1_np, x2_np) + + assert_array_equal(dpt.asnumpy(res), res_np) + + res = dpt.where(m, empty, x2) + res_np = np.where(m_np, empty_np, x2_np) + + assert_array_equal(dpt.asnumpy(res), res_np) + + # check that broadcasting is performed + with pytest.raises(ValueError): + dpt.where(empty, x1, dpt.empty((1, 2))) + + +@pytest.mark.parametrize("order", ["C", "F"]) +def test_where_contiguous(order): + get_queue_or_skip() + + cond = dpt.asarray( + [ + [[True, False, False], [False, True, True]], + [[False, True, False], [True, False, True]], + [[False, False, True], [False, False, True]], + [[False, False, False], [True, False, True]], + [[True, True, True], [True, False, True]], + ], + order=order, + ) + + x1 = dpt.full(cond.shape, 2, dtype="i4", order=order) + x2 = dpt.full(cond.shape, 3, dtype="i4", order=order) + expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2)) + res = dpt.where(cond, x1, x2) + + assert _dtype_all_close(dpt.asnumpy(res), expected) + + +def test_where_contiguous1D(): + get_queue_or_skip() + + cond = dpt.asarray([True, False, True, False, False, True]) + + x1 = dpt.full(cond.shape, 2, dtype="i4") + x2 = dpt.full(cond.shape, 3, dtype="i4") + expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2)) + res = dpt.where(cond, x1, x2) + assert_array_equal(dpt.asnumpy(res), expected) + + # test with complex dtype (branch in kernel) + x1 = dpt.astype(x1, dpt.complex64) + x2 = dpt.astype(x2, dpt.complex64) + expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2)) + res = dpt.where(cond, x1, x2) + assert _dtype_all_close(dpt.asnumpy(res), expected) + + +def test_where_gh_1170(): + get_queue_or_skip() + + cond = dpt.asarray([False, True, True, False], dtype="?") + x1 = dpt.ones((3, 4), dtype="i4") + x2 = 
dpt.zeros((3, 4), dtype="i4") + + res = dpt.where(cond, x1, x2) + expected = np.broadcast_to(dpt.asnumpy(cond).astype(x1.dtype), x1.shape) + + assert_array_equal(dpt.asnumpy(res), expected) + + +def test_where_strided(): + get_queue_or_skip() + + s0, s1 = 4, 9 + cond = dpt.reshape( + dpt.asarray( + [True, False, False, False, True, True, False, True, False] * s0 + ), + (s0, s1), + )[:, ::3] + + x1 = dpt.reshape( + dpt.arange(cond.shape[0] * cond.shape[1] * 2, dtype="i4"), + (cond.shape[0], cond.shape[1] * 2), + )[:, ::2] + x2 = dpt.reshape( + dpt.arange(cond.shape[0] * cond.shape[1] * 3, dtype="i4"), + (cond.shape[0], cond.shape[1] * 3), + )[:, ::3] + expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2)) + res = dpt.where(cond, x1, x2) + + assert_array_equal(dpt.asnumpy(res), expected) + + # negative strides + res = dpt.where(cond, dpt.flip(x1), x2) + expected = np.where( + dpt.asnumpy(cond), np.flip(dpt.asnumpy(x1)), dpt.asnumpy(x2) + ) + assert_array_equal(dpt.asnumpy(res), expected) + + res = dpt.where(dpt.flip(cond), x1, x2) + expected = np.where( + np.flip(dpt.asnumpy(cond)), dpt.asnumpy(x1), dpt.asnumpy(x2) + ) + assert_array_equal(dpt.asnumpy(res), expected) + + +def test_where_invariants(): + get_queue_or_skip() + + test_sh = ( + 6, + 8, + ) + mask = dpt.asarray(np.random.choice([True, False], size=test_sh)) + p = dpt.ones(test_sh, dtype=dpt.int16) + m = dpt.full(test_sh, -1, dtype=dpt.int16) + inds_list = [ + ( + np.s_[:3], + np.s_[::2], + ), + ( + np.s_[::2], + np.s_[::2], + ), + ( + np.s_[::-1], + np.s_[:], + ), + ] + for ind in inds_list: + r1 = dpt.where(mask, p, m)[ind] + r2 = dpt.where(mask[ind], p[ind], m[ind]) + assert (dpt.asnumpy(r1) == dpt.asnumpy(r2)).all() + + +def test_where_arg_validation(): + get_queue_or_skip() + + check = {} + x1 = dpt.empty((1,), dtype="i4") + x2 = dpt.empty((1,), dtype="i4") + + with pytest.raises(TypeError): + dpt.where(check, x1, x2) + with pytest.raises(ValueError): + dpt.where(x1, check, x2) + with pytest.raises(ValueError): + dpt.where(x1, x2, check) + + +def test_where_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + q3 = get_queue_or_skip() + + x1 = dpt.empty((1,), dtype="i4", sycl_queue=q1) + x2 = dpt.empty((1,), dtype="i4", sycl_queue=q2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q1), x1, x2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q3), x1, x2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.where(x1, x1, x2) + + +def test_where_order(): + get_queue_or_skip() + + test_sh = ( + 20, + 20, + ) + test_sh2 = tuple(2 * dim for dim in test_sh) + n = test_sh[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]): + ar1 = dpt.zeros(test_sh, dtype=dt1, order="C") + ar2 = dpt.ones(test_sh, dtype=dt2, order="C") + condition = dpt.zeros(test_sh, dtype="?", order="C") + res1 = dpt.where(condition, ar1, ar2, order="C") + assert res1.flags.c_contiguous + res2 = dpt.where(condition, ar1, ar2, order="F") + assert res2.flags.f_contiguous + res3 = dpt.where(condition, ar1, ar2, order="A") + assert res3.flags.c_contiguous + res4 = dpt.where(condition, ar1, ar2, order="K") + assert res4.flags.c_contiguous + + ar1 = dpt.ones(test_sh, dtype=dt1, order="F") + ar2 = dpt.ones(test_sh, dtype=dt2, order="F") + condition = dpt.zeros(test_sh, dtype="?", order="F") + res1 = dpt.where(condition, ar1, ar2, order="C") + assert res1.flags.c_contiguous + res2 = 
dpt.where(condition, ar1, ar2, order="F")
+        assert res2.flags.f_contiguous
+        res3 = dpt.where(condition, ar1, ar2, order="A")
+        assert res3.flags.f_contiguous
+        res4 = dpt.where(condition, ar1, ar2, order="K")
+        assert res4.flags.f_contiguous
+
+        ar1 = dpt.ones(test_sh2, dtype=dt1, order="C")[:20, ::-2]
+        ar2 = dpt.ones(test_sh2, dtype=dt2, order="C")[:20, ::-2]
+        condition = dpt.zeros(test_sh2, dtype="?", order="C")[:20, ::-2]
+        res1 = dpt.where(condition, ar1, ar2, order="K")
+        assert res1.strides == (n, -1)
+        res2 = dpt.where(condition, ar1, ar2, order="C")
+        assert res2.strides == (n, 1)
+
+        ar1 = dpt.ones(test_sh2, dtype=dt1, order="C")[:20, ::-2].mT
+        ar2 = dpt.ones(test_sh2, dtype=dt2, order="C")[:20, ::-2].mT
+        condition = dpt.zeros(test_sh2, dtype="?", order="C")[:20, ::-2].mT
+        res1 = dpt.where(condition, ar1, ar2, order="K")
+        assert res1.strides == (-1, n)
+        res2 = dpt.where(condition, ar1, ar2, order="C")
+        assert res2.strides == (n, 1)
+
+        ar1 = dpt.ones(n, dtype=dt1, order="C")
+        ar2 = dpt.broadcast_to(dpt.ones(n, dtype=dt2, order="C"), test_sh)
+        condition = dpt.zeros(n, dtype="?", order="C")
+        res = dpt.where(condition, ar1, ar2, order="K")
+        assert res.strides == (20, 1)
+
+
+def test_where_unaligned():
+    get_queue_or_skip()
+
+    x = dpt.ones(513, dtype="i4")
+    a = dpt.full(512, 2, dtype="i4")
+    b = dpt.zeros(512, dtype="i4")
+
+    expected = dpt.full(512, 2, dtype="i4")
+    assert dpt.all(dpt.where(x[1:], a, b) == expected)
+
+
+def test_where_out():
+    get_queue_or_skip()
+
+    n1, n2, n3 = 3, 4, 5
+    ar1 = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype="i4"), (n1, n2, n3))
+    ar2 = dpt.full_like(ar1, -5)
+    condition = dpt.tile(
+        dpt.reshape(
+            dpt.asarray([True, False, False, True], dtype="?"), (1, n2, 1)
+        ),
+        (n1, 1, n3),
+    )
+
+    out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i4")
+    res = dpt.where(condition, ar1, ar2, out=out[::-2, 1::3, :])
+
+    assert dpt.all(res == out[::-2, 1::3, :])
+    assert dpt.all(out[::-2, 0::3, :] == 0)
+    assert dpt.all(out[::-2, 2::3, :] == 0)
+
+    assert dpt.all(res[:, 1:3, :] == -5)
+    assert dpt.all(res[:, 0, :] == ar1[:, 0, :])
+    assert dpt.all(res[:, 3, :] == ar1[:, 3, :])
+
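+    # the remaining cases pass out= views that alias condition, ar1, or ar2,
+    # checking that results under overlap match the non-overlapping outcome
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([1, 0], dtype="i4"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    res = dpt.where(
+        condition[:, ::-1, :], condition[:, ::-1, :], condition, out=condition
+    )
+    assert dpt.all(res == condition)
+    assert dpt.all(condition == 1)
+
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    ar1 = dpt.full((n1, n2, n3), 7, dtype="i4")
+    ar2 = dpt.full_like(ar1, -5)
+    res = dpt.where(condition, ar1, ar2, out=ar2[:, ::-1, :])
+    assert dpt.all(ar2[:, ::-1, :] == res)
+    assert dpt.all(ar2[:, ::2, :] == -5)
+    assert dpt.all(ar2[:, 1::2, :] == 7)
+
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    ar1 = dpt.full((n1, n2, n3), 7, dtype="i4")
+    ar2 = dpt.full_like(ar1, -5)
+    res = dpt.where(condition, ar1, ar2, out=ar1[:, ::-1, :])
+    assert dpt.all(ar1[:, ::-1, :] == res)
+    assert dpt.all(ar1[:, ::2, :] == -5)
+    assert dpt.all(ar1[:, 1::2, :] == 7)
+
+
+def test_where_out_arg_validation():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    condition = dpt.ones(5, dtype="i4", sycl_queue=q1)
+    x1 = dpt.ones(5, dtype="i4", sycl_queue=q1)
+    x2 = dpt.ones(5, dtype="i4", sycl_queue=q1)
+
+    out_wrong_queue = dpt.empty_like(condition, sycl_queue=q2)
+    out_wrong_dtype = dpt.empty_like(condition, dtype="f4")
+    out_wrong_shape = dpt.empty(6, 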
dtype="i4", sycl_queue=q1) + out_not_writable = dpt.empty_like(condition) + out_not_writable.flags["W"] = False + + with pytest.raises(TypeError): + dpt.where(condition, x1, x2, out=dict()) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.where(condition, x1, x2, out=out_wrong_queue) + with pytest.raises(ValueError): + dpt.where(condition, x1, x2, out=out_wrong_dtype) + with pytest.raises(ValueError): + dpt.where(condition, x1, x2, out=out_wrong_shape) + with pytest.raises(ValueError): + dpt.where(condition, x1, x2, out=out_not_writable) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_where_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + n1, n2 = 10, 10 + condition = dpt.tile( + dpt.reshape( + dpt.asarray([True, False], dtype="?", sycl_queue=q), (1, 2) + ), + (n1, n2 // 2), + ) + x = dpt.zeros((n1, n2), dtype=arr_dt, sycl_queue=q) + py_scalars = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_scalars: + r = dpt.where(condition, x, sc) + assert isinstance(r, dpt.usm_ndarray) + r = dpt.where(condition, sc, x) + assert isinstance(r, dpt.usm_ndarray) + + +def test_where_two_python_scalars(): + get_queue_or_skip() + + n1, n2 = 10, 10 + condition = dpt.tile( + dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2)), + (n1, n2 // 2), + ) + + py_scalars = [ + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ] + + for sc1, sc2 in itertools.product(py_scalars, repeat=2): + r = dpt.where(condition, sc1, sc2) + assert isinstance(r, dpt.usm_ndarray) diff --git a/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py b/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py new file mode 100644 index 000000000000..aef782f06f08 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py @@ -0,0 +1,407 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + + +def _check(hay_stack, needles, needles_np): + assert hay_stack.dtype == needles.dtype + assert hay_stack.ndim == 1 + + info_ = dpt.__array_namespace_info__() + default_dts_dev = info_.default_dtypes(device=hay_stack.device) + index_dt = default_dts_dev["indexing"] + + p_left = dpt.searchsorted(hay_stack, needles, side="left") + assert p_left.dtype == index_dt + + hs_np = dpt.asnumpy(hay_stack) + ref_left = np.searchsorted(hs_np, needles_np, side="left") + assert dpt.all(p_left == dpt.asarray(ref_left)) + + p_right = dpt.searchsorted(hay_stack, needles, side="right") + assert p_right.dtype == index_dt + + ref_right = np.searchsorted(hs_np, needles_np, side="right") + assert dpt.all(p_right == dpt.asarray(ref_right)) + + sorter = dpt.arange(hay_stack.size) + ps_left = dpt.searchsorted(hay_stack, needles, side="left", sorter=sorter) + assert ps_left.dtype == index_dt + assert dpt.all(ps_left == p_left) + ps_right = dpt.searchsorted(hay_stack, needles, side="right", sorter=sorter) + assert ps_right.dtype == index_dt + assert dpt.all(ps_right == p_right) + + +def test_searchsorted_contig_bool(): + get_queue_or_skip() + + dt = dpt.bool + + hay_stack = dpt.arange(0, 1, dtype=dt) + needles_np = np.random.choice([True, False], size=1024) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) + _check( + hay_stack, + dpt.reshape(needles, (32, 32)), + np.reshape(needles_np, (32, 32)), + ) + + +def test_searchsorted_strided_bool(): + get_queue_or_skip() + + dt = dpt.bool + + hay_stack = dpt.repeat(dpt.arange(0, 1, dtype=dt), 4)[::4] + needles_np = np.random.choice([True, False], size=2 * 1024) + needles = dpt.asarray(needles_np) + sl = slice(None, None, -2) + + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + +@pytest.mark.parametrize( + "idt", + [ + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + ], +) +def test_searchsorted_contig_int(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + max_v = dpt.iinfo(dt).max + + hay_stack = dpt.arange(0, min(max_v, 255), dtype=dt) + needles_np = np.random.randint(0, max_v, dtype=dt, size=1024) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) + _check( + hay_stack, + dpt.reshape(needles, (32, 32)), + np.reshape(needles_np, (32, 32)), + ) + + +@pytest.mark.parametrize( + "idt", + [ + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + ], +) +def test_searchsorted_strided_int(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + max_v = dpt.iinfo(dt).max + + hay_stack = dpt.repeat(dpt.arange(0, min(max_v, 255), dtype=dt), 4)[1::4] + needles_np = np.random.randint(0, max_v, dtype=dt, size=2 * 1024) + needles = dpt.asarray(needles_np) + sl = slice(None, None, -2) + + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + +def _add_extended_fp(array): + array[0] = -dpt.inf + array[-2] = dpt.inf + array[-1] = dpt.nan + + +@pytest.mark.parametrize("idt", [dpt.float16, 
dpt.float32, dpt.float64]) +def test_searchsorted_contig_fp(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + + hay_stack = dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True) + _add_extended_fp(hay_stack) + + needles_np = np.random.uniform(-0.1, 1.1, size=1024).astype(dt) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) + _check( + hay_stack, + dpt.reshape(needles, (32, 32)), + np.reshape(needles_np, (32, 32)), + ) + + +@pytest.mark.parametrize("idt", [dpt.float16, dpt.float32, dpt.float64]) +def test_searchsorted_strided_fp(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + + hay_stack = dpt.repeat( + dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True), 4 + )[1::4] + _add_extended_fp(hay_stack) + + needles_np = np.random.uniform(-0.1, 1.1, size=3 * 1024).astype(dt) + needles = dpt.asarray(needles_np) + sl = slice(1, None, 3) + + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + +def _add_extended_cfp(array): + dt = array.dtype + ev_li = [ + complex(-dpt.inf, -1), + complex(-dpt.inf, -dpt.inf), + complex(-dpt.inf, dpt.inf), + complex(-dpt.inf, dpt.nan), + complex(0, -dpt.inf), + complex(0, -1), + complex(0, dpt.inf), + complex(0, dpt.nan), + complex(dpt.inf, -dpt.inf), + complex(dpt.inf, -1), + complex(dpt.inf, dpt.inf), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, -dpt.inf), + complex(dpt.nan, -1), + complex(dpt.nan, dpt.inf), + complex(dpt.nan, dpt.nan), + ] + ev = dpt.asarray(ev_li, dtype=dt, device=array.device) + return dpt.sort(dpt.concat((ev, array))) + + +@pytest.mark.parametrize("idt", [dpt.complex64, dpt.complex128]) +def test_searchsorted_contig_cfp(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + + hay_stack = dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True) + hay_stack = _add_extended_cfp(hay_stack) + needles_np = np.random.uniform(-0.1, 1.1, size=1024).astype(dt) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) + _check( + hay_stack, + dpt.reshape(needles, (32, 32)), + np.reshape(needles_np, (32, 32)), + ) + + +@pytest.mark.parametrize("idt", [dpt.complex64, dpt.complex128]) +def test_searchsorted_strided_cfp(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + + hay_stack = dpt.repeat( + dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True), 4 + )[1::4] + needles_np = np.random.uniform(-0.1, 1.1, size=3 * 1024).astype(dt) + needles = dpt.asarray(needles_np) + sl = slice(1, None, 3) + + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + hay_stack = _add_extended_cfp(hay_stack) + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + +def test_searchsorted_coerce(): + get_queue_or_skip() + + x1_i4 = dpt.arange(5, dtype="i4") + x1_i8 = dpt.arange(5, dtype="i8") + x2_i4 = dpt.arange(5, dtype="i4") + x2_i8 = dpt.arange(5, dtype="i8") + + p1 = dpt.searchsorted(x1_i4, x2_i8) + p2 = dpt.searchsorted(x1_i8, x2_i8) + p3 = dpt.searchsorted(x1_i8, x2_i4) + assert dpt.all(p1 == p2) + assert dpt.all(p2 == p3) + + +def test_searchsorted_validation(): + with pytest.raises(TypeError): + dpt.searchsorted(None, None) + try: + x1 = 
dpt.arange(10, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("Default device could not be created") + with pytest.raises(TypeError): + dpt.searchsorted(x1, None) + with pytest.raises(TypeError): + dpt.searchsorted(x1, x1, sorter=dict()) + with pytest.raises(ValueError): + dpt.searchsorted(x1, x1, side="unknown") + + +def test_searchsorted_validation2(): + try: + x1 = dpt.arange(10, dtype="i4") + sorter = dpt.arange(10, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("Default device could not be created") + d = x1.sycl_device + q2 = dpctl.SyclQueue(d, property="in_order") + x2 = dpt.ones(5, dtype=x1.dtype, sycl_queue=q2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.searchsorted(x1, x2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.searchsorted(x1, x2, sorter=sorter) + + sorter = dpt.ones(x1.shape, dtype=dpt.bool) + # non-integral sorter.dtype raises + with pytest.raises(ValueError): + dpt.searchsorted(x1, x1, sorter=sorter) + + # non-matching x1.shape and sorter.shape raises + with pytest.raises(ValueError): + dpt.searchsorted(x1, x1, sorter=sorter[:-1]) + + # x1 must be 1d, or ValueError is raised + with pytest.raises(ValueError): + dpt.searchsorted(x1[dpt.newaxis, :], x1) + + +def test_pw_linear_interpolation_example(): + get_queue_or_skip() + + bins = dpt.asarray([0.0, 0.05, 0.2, 0.25, 0.5, 0.8, 0.95, 1]) + vals = dpt.asarray([0.1, 0.15, 0.3, 0.5, 0.7, 0.53, 0.37, 0.1]) + assert vals.shape == bins.shape + data_np = np.random.uniform(0, 1, size=10000) + data = dpt.asarray(data_np) + + p = dpt.searchsorted(bins, data) + w = (data - bins[p]) / (bins[p - 1] - bins[p]) + assert dpt.min(w) >= 0 + assert dpt.max(w) <= 1 + interp_vals = vals[p - 1] * w + (1 - w) * vals[p] + + assert interp_vals.shape == data.shape + assert dpt.min(interp_vals) >= dpt.zeros(tuple()) + av = dpt.sum(interp_vals) / data.size + exp = dpt.vecdot(vals[1:] + vals[:-1], bins[1:] - bins[:-1]) / 2 + + assert dpt.abs(av - exp) < 0.1 + + +def test_out_of_bound_sorter_values(): + get_queue_or_skip() + + x = dpt.asarray([1, 2, 0], dtype="i4") + n = x.shape[0] + + # use out-of-bounds indices in sorter + sorter = dpt.asarray([2, 0 - n, 1 - n], dtype="i8") + + x2 = dpt.arange(3, dtype=x.dtype) + p = dpt.searchsorted(x, x2, sorter=sorter) + # verify that they were applied with mode="wrap" + assert dpt.all(p == dpt.arange(3, dtype=p.dtype)) + + +def test_searchsorted_strided_scalar_needle(): + get_queue_or_skip() + + a_max = 255 + + hay_stack = dpt.flip( + dpt.repeat(dpt.arange(a_max - 1, -1, -1, dtype=dpt.int32), 4) + ) + needles_np = np.squeeze( + np.random.randint(0, a_max, dtype=dpt.int32, size=1), axis=0 + ) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) diff --git a/dpnp/tests/tensor/test_usm_ndarray_sorting.py b/dpnp/tests/tensor/test_usm_ndarray_sorting.py new file mode 100644 index 000000000000..af96811bf2f9 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_sorting.py @@ -0,0 +1,397 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s = dpt.sort(inp, descending=False) + assert dpt.all(s[:-1] <= s[1:]) + + s1 = dpt.sort(inp, descending=True) + assert dpt.all(s1[:-1] >= s1[1:]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_2d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + fl = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + inp = dpt.reshape(fl, (20, -1)) + + s = dpt.sort(inp, axis=1, descending=False) + assert dpt.all(s[:, :-1] <= s[:, 1:]) + + s1 = dpt.sort(inp, axis=1, descending=True) + assert dpt.all(s1[:, :-1] >= s1[:, 1:]) + + +def test_sort_strides(): + get_queue_or_skip() + + fl = dpt.roll( + dpt.concat((dpt.ones(10000, dtype="i4"), dpt.zeros(10000, dtype="i4"))), + 734, + ) + inp = dpt.reshape(fl, (-1, 20)) + + s = dpt.sort(inp, axis=0, descending=False) + assert dpt.all(s[:-1, :] <= s[1:, :]) + + s1 = dpt.sort(inp, axis=0, descending=True) + assert dpt.all(s1[:-1, :] >= s1[1:, :]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_argsort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s_idx = dpt.argsort(inp, descending=False) + assert dpt.all(inp[s_idx[:-1]] <= inp[s_idx[1:]]) + + s1_idx = dpt.argsort(inp, descending=True) + assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]]) + + +def 
test_sort_validation(): + with pytest.raises(TypeError): + dpt.sort(dict()) + + +def test_sort_validation_kind(): + get_queue_or_skip() + + x = dpt.ones(128, dtype="u1") + + with pytest.raises(ValueError): + dpt.sort(x, kind=Ellipsis) + + with pytest.raises(ValueError): + dpt.sort(x, kind="invalid") + + +def test_argsort_validation(): + with pytest.raises(TypeError): + dpt.argsort(dict()) + + +def test_argsort_validation_kind(): + get_queue_or_skip() + + x = dpt.arange(127, stop=0, step=-1, dtype="i1") + + with pytest.raises(ValueError): + dpt.argsort(x, kind=Ellipsis) + + with pytest.raises(ValueError): + dpt.argsort(x, kind="invalid") + + +_all_kinds = ["stable", "mergesort", "radixsort"] + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_sort_axis0(kind): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + s = dpt.sort(x, axis=0, kind=kind) + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_argsort_axis0(kind): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + idx = dpt.argsort(x, axis=0, kind=kind) + + s = dpt.take_along_axis(x, idx, axis=0) + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_argsort_axis1(kind): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + idx = dpt.argsort(x, axis=1, kind=kind) + + s = dpt.take_along_axis(x, idx, axis=1) + + assert dpt.all(s[:, :-1] <= s[:, 1:]) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_sort_strided(kind): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + s = dpt.sort(x_flipped, kind=kind) + + assert dpt.all(s == x_orig) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_argsort_strided(kind): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + idx = dpt.argsort(x_flipped, kind=kind) + s = dpt.take_along_axis(x_flipped, idx, axis=0) + + assert dpt.all(s == x_orig) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_sort_0d_array(kind): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + expected = dpt.asarray(1, dtype="i4") + assert dpt.sort(x, kind=kind) == expected + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_argsort_0d_array(kind): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + expected = dpt.asarray(0, dtype="i4") + assert dpt.argsort(x, kind=kind) == expected + + +@pytest.mark.parametrize( + "dtype", + [ + "f2", + "f4", + "f8", + ], +) +@pytest.mark.parametrize("kind", _all_kinds) +def test_sort_real_fp_nan(dtype, kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray( + [-0.0, 0.1, dpt.nan, 0.0, -0.1, dpt.nan, 0.2, -0.3], dtype=dtype + ) + s = dpt.sort(x, kind=kind) + + expected = dpt.asarray( + [-0.3, -0.1, -0.0, 0.0, 0.1, 0.2, dpt.nan, dpt.nan], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + s = dpt.sort(x, descending=True, kind=kind) + + expected = dpt.asarray( + [dpt.nan, dpt.nan, 0.2, 0.1, -0.0, 0.0, -0.1, -0.3], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + +@pytest.mark.parametrize( + "dtype", + [ + "c8", + "c16", + ], +) +def test_sort_complex_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + rvs = [-0.0, 0.1, 0.0, 
0.2, -0.3, dpt.nan]
+    ivs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan]
+
+    cv = []
+    for rv in rvs:
+        for iv in ivs:
+            cv.append(complex(rv, iv))
+
+    inp = dpt.asarray(cv, dtype=dtype)
+    s = dpt.sort(inp)
+
+    expected = np.sort(dpt.asnumpy(inp))
+
+    assert np.allclose(dpt.asnumpy(s), expected, equal_nan=True)
+
+    # sort every ordered pair of elements and require bit-exact agreement
+    # with NumPy's ordering
+    pairs = []
+    for i, j in itertools.permutations(range(inp.shape[0]), 2):
+        pairs.append([i, j])
+    sub_arrs = inp[dpt.asarray(pairs)]
+    m1 = dpt.asnumpy(dpt.sort(sub_arrs, axis=1))
+    m2 = np.sort(dpt.asnumpy(sub_arrs), axis=1)
+    for k in range(len(pairs)):
+        i, j = pairs[k]
+        r1 = m1[k]
+        r2 = m2[k]
+        # viewing as int64 makes the comparison bitwise, so -0.0 vs 0.0
+        # and distinct NaN bit patterns are not conflated
+        assert np.array_equal(
+            r1.view(np.int64), r2.view(np.int64)
+        ), f"Failed for {i} and {j}"
+
+
+def test_radix_sort_size_1_axis():
+    get_queue_or_skip()
+
+    x1 = dpt.ones((), dtype="i1")
+    r1 = dpt.sort(x1, kind="radixsort")
+    assert_array_equal(dpt.asnumpy(r1), dpt.asnumpy(x1))
+
+    x2 = dpt.ones([1], dtype="i1")
+    r2 = dpt.sort(x2, kind="radixsort")
+    assert_array_equal(dpt.asnumpy(r2), dpt.asnumpy(x2))
+
+    x3 = dpt.reshape(dpt.arange(10, dtype="i1"), (10, 1))
+    r3 = dpt.sort(x3, kind="radixsort")
+    assert dpt.asnumpy(r3 == x3).all()
+
+    x4 = dpt.reshape(dpt.arange(10, dtype="i1"), (1, 10))
+    r4 = dpt.sort(x4, axis=0, kind="radixsort")
+    assert dpt.asnumpy(r4 == x4).all()
+
+
+def test_radix_argsort_size_1_axis():
+    get_queue_or_skip()
+
+    x1 = dpt.ones((), dtype="i1")
+    r1 = dpt.argsort(x1, kind="radixsort")
+    assert r1 == 0
+
+    x2 = dpt.ones([1], dtype="i1")
+    r2 = dpt.argsort(x2, kind="radixsort")
+    assert dpt.asnumpy(r2 == 0).all()
+
+    x3 = dpt.reshape(dpt.arange(10, dtype="i1"), (10, 1))
+    r3 = dpt.argsort(x3, kind="radixsort")
+    assert dpt.asnumpy(r3 == 0).all()
+
+    x4 = dpt.reshape(dpt.arange(10, dtype="i1"), (1, 10))
+    r4 = dpt.argsort(x4, axis=0, kind="radixsort")
+    assert dpt.asnumpy(r4 == 0).all()
diff --git a/dpnp/tests/tensor/test_usm_ndarray_top_k.py b/dpnp/tests/tensor/test_usm_ndarray_top_k.py
new file mode 100644
index 000000000000..1c04c1fff57a
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_top_k.py
@@ -0,0 +1,331 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+def _expected_largest_inds(inp, n, shift, k):
+    "Compute expected top_k indices for mode='largest'"
+    assert k < n
+    ones_start_id = shift % (2 * n)
+
+    alloc_dev = inp.device
+
+    if ones_start_id < n:
+        expected_inds = dpt.arange(
+            ones_start_id, ones_start_id + k, dtype="i8", device=alloc_dev
+        )
+    else:
+        # wrap-around
+        ones_end_id = (ones_start_id + n) % (2 * n)
+        if ones_end_id >= k:
+            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
+        else:
+            expected_inds = dpt.concat(
+                (
+                    dpt.arange(ones_end_id, dtype="i8", device=alloc_dev),
+                    dpt.arange(
+                        ones_start_id,
+                        ones_start_id + k - ones_end_id,
+                        dtype="i8",
+                        device=alloc_dev,
+                    ),
+                )
+            )
+
+    return expected_inds
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [33, 43, 255, 511, 1021, 8193])
+def test_top_k_1d_largest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    shift, k = 734, 5
+    o = dpt.ones(n, dtype=dtype)
+    z = dpt.zeros(n, dtype=dtype)
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_largest_inds(oz, n, shift, k)
+
+    s = dpt.top_k(inp, k, mode="largest")
+    assert s.values.shape == (k,)
+    assert s.values.dtype == inp.dtype
+    assert s.indices.shape == (k,)
+    assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values
+    assert dpt.all(s.values == inp[s.indices]), s.indices
+    assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds)
+
+
+def _expected_smallest_inds(inp, n, shift, k):
+    "Compute expected top_k indices for mode='smallest'"
+    assert k < n
+    zeros_start_id = (n + shift) % (2 * n)
+    zeros_end_id = (shift) % (2 * n)
+
+    alloc_dev = inp.device
+
+    if zeros_start_id < zeros_end_id:
+        expected_inds = dpt.arange(
+            zeros_start_id, zeros_start_id + k, dtype="i8", device=alloc_dev
+        )
+    else:
+        # wrap-around
+        if zeros_end_id >= k:
+            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
+        else:
+            expected_inds = dpt.concat(
+                (
+                    dpt.arange(zeros_end_id, dtype="i8", device=alloc_dev),
+                    dpt.arange(
+                        zeros_start_id,
+                        zeros_start_id + k - zeros_end_id,
+                        dtype="i8",
+                        device=alloc_dev,
+                    ),
+                )
+            )
+
+    return expected_inds
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_1d_smallest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    shift, k = 734, 5
+    o = dpt.ones(n, dtype=dtype)
+    z = dpt.zeros(n, dtype=dtype)
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_smallest_inds(oz, n, shift, k)
+
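+    # the k smallest entries are the zeros planted above; check the returned
+    # values and the wrap-around index positions computed by the helper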
+    s = dpt.top_k(inp, k, mode="smallest")
+    assert s.values.shape == (k,)
+    assert s.values.dtype == inp.dtype
+    assert s.indices.shape == (k,)
+    assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values
+    assert dpt.all(s.values == inp[s.indices]), s.indices
+    assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        # skip short types to ensure that m*n can be represented
+        # in the type
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_2d_largest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m, k = 8, 3
+    if dtype == "f2" and m * n > 2000:
+        pytest.skip(
+            "f2 cannot distinguish between large integers used in this test"
+        )
+
+    x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n))
+
+    r = dpt.top_k(x, k, axis=1)
+
+    assert r.values.shape == (m, k)
+    assert r.indices.shape == (m, k)
+    expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[
+        :, -k:
+    ]
+    assert expected_inds.shape == (1, k)
+    assert dpt.all(
+        dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1)
+    ), (r.indices, expected_inds)
+    expected_vals = x[:, -k:]
+    assert dpt.all(
+        dpt.sort(r.values, axis=1) == dpt.sort(expected_vals, axis=1)
+    )
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        # skip short types to ensure that m*n can be represented
+        # in the type
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_2d_smallest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m, k = 8, 3
+    if dtype == "f2" and m * n > 2000:
+        pytest.skip(
+            "f2 cannot distinguish between large integers used in this test"
+        )
+
+    x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n))
+
+    r = dpt.top_k(x, k, axis=1, mode="smallest")
+
+    assert r.values.shape == (m, k)
+    assert r.indices.shape == (m, k)
+    expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[
+        :, :k
+    ]
+    assert dpt.all(
+        dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1)
+    )
+    assert dpt.all(dpt.sort(r.values, axis=1) == dpt.sort(x[:, :k], axis=1))
+
+
+def test_top_k_0d():
+    get_queue_or_skip()
+
+    a = dpt.ones((), dtype="i4")
+    assert a.ndim == 0
+    assert a.size == 1
+
+    r = dpt.top_k(a, 1)
+    assert r.values == a
+    assert r.indices == dpt.zeros_like(a, dtype=r.indices.dtype)
+
+
+def test_top_k_noncontig():
+    get_queue_or_skip()
+
+    a = dpt.arange(256, dtype=dpt.int32)[::2]
+    r = dpt.top_k(a, 3)
+
+    assert dpt.all(dpt.sort(r.values) == dpt.asarray([250, 252, 254])), r.values
+    assert dpt.all(
+        dpt.sort(r.indices) == dpt.asarray([125, 126, 127])
+    ), r.indices
+
+
+def test_top_k_axis0():
+    get_queue_or_skip()
+
+    m, n, k = 128, 8, 3
+    x = dpt.reshape(dpt.arange(m * n, dtype=dpt.int32), (m, n))
+
+    r = dpt.top_k(x, k, axis=0, mode="smallest")
+    assert r.values.shape == (k, n)
+    assert r.indices.shape == (k, n)
+    expected_inds = dpt.reshape(dpt.arange(m, dtype=r.indices.dtype), (m, 1))[
+        :k, :
+    ]
+    assert dpt.all(
+        dpt.sort(r.indices, axis=0) == dpt.sort(expected_inds, axis=0)
+    )
+    assert dpt.all(dpt.sort(r.values, axis=0) == dpt.sort(x[:k, :], axis=0))
+
+
+def test_top_k_validation():
+    get_queue_or_skip()
+    x = dpt.ones(10, dtype=dpt.int64)
+    with pytest.raises(ValueError):
+        # k must be positive
+        dpt.top_k(x, -1)
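+    # the remaining misuses below must raise as well: a non-array argument,
+    # k exceeding the axis length, k != 1 for a 0d input, and an unknown mode
+    with pytest.raises(TypeError):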
+        # argument should be usm_ndarray
+        dpt.top_k(list(), 2)
+    x2 = dpt.reshape(x, (2, 5))
+    with pytest.raises(ValueError):
+        # k must not exceed array dimension
+        # along specified axis
+        dpt.top_k(x2, 100, axis=1)
+    with pytest.raises(ValueError):
+        # for 0d arrays, k must be 1
+        dpt.top_k(x[0], 2)
+    with pytest.raises(ValueError):
+        # mode must be "largest" or "smallest"
+        dpt.top_k(x, 2, mode="invalid")
diff --git a/dpnp/tests/tensor/test_usm_ndarray_unique.py b/dpnp/tests/tensor/test_usm_ndarray_unique.py
new file mode 100644
index 000000000000..d602c0346f5d
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_unique.py
@@ -0,0 +1,361 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+
+
+def test_unique_values_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_counts(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype))
+
+
+def test_unique_counts_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_inverse(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+
+def test_unique_inverse_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_all(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
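+    # unique_all bundles values, first-occurrence indices, the inverse
+    # mapping, and counts in a single call; check all four outputs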
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(uv == inp[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype))
+
+
+def test_unique_all_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv == dpt.reshape(inp, -1)[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv == inp[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+
+def test_set_functions_empty_input():
+    get_queue_or_skip()
+    x = dpt.ones((10, 0, 1), dtype="i4")
+
+    res = dpt.unique_values(x)
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.size == 0
+    assert res.dtype == x.dtype
+
+    res = dpt.unique_inverse(x)
+    assert type(res).__name__ == "UniqueInverseResult"
+    uv, inv = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(inv, dpt.usm_ndarray)
+    assert inv.size == 0
+
+    res = dpt.unique_counts(x)
+    assert type(res).__name__ == "UniqueCountsResult"
+    uv, uv_counts = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(uv_counts, dpt.usm_ndarray)
+    assert uv_counts.size == 0
+
+    res = dpt.unique_all(x)
+    assert type(res).__name__ == "UniqueAllResult"
+    uv, ind, inv, uv_counts = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(ind, dpt.usm_ndarray)
+    assert ind.size == 0
+    assert isinstance(inv, dpt.usm_ndarray)
+    assert inv.size == 0
+    assert isinstance(uv_counts, dpt.usm_ndarray)
+    assert uv_counts.size == 0
+
+
+def test_set_function_outputs():
+    get_queue_or_skip()
+    # check standard and early exit paths
+    x1 = dpt.arange(10, dtype="i4")
+    x2 = dpt.ones((10, 10), dtype="i4")
+
+    assert isinstance(dpt.unique_values(x1), dpt.usm_ndarray)
+    assert isinstance(dpt.unique_values(x2), dpt.usm_ndarray)
+
+    assert type(dpt.unique_inverse(x1)).__name__ == "UniqueInverseResult"
+    assert type(dpt.unique_inverse(x2)).__name__ == "UniqueInverseResult"
+
+    assert type(dpt.unique_counts(x1)).__name__ == "UniqueCountsResult"
+    assert type(dpt.unique_counts(x2)).__name__ == "UniqueCountsResult"
+
+    assert type(dpt.unique_all(x1)).__name__ == "UniqueAllResult"
+    assert type(dpt.unique_all(x2)).__name__ == "UniqueAllResult"
+
+
+def test_set_functions_compute_follows_data():
+    # tests that all intermediate calls and allocations are compatible
+    # with an input allocated on an arbitrary queue
+    get_queue_or_skip()
+    q = dpctl.SyclQueue()
+    x = dpt.arange(10, dtype="i4", sycl_queue=q)
+
+    uv = dpt.unique_values(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    uv, uc = dpt.unique_counts(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert isinstance(uc, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    assert uc.sycl_queue == q
+    uv, inv_ind = dpt.unique_inverse(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert isinstance(inv_ind, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    assert inv_ind.sycl_queue == q
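+    # unique_all must likewise place all four outputs on the input's queue
+    uv, ind, inv_ind, uc = 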
dpt.unique_all(x) + assert isinstance(uv, dpt.usm_ndarray) + assert isinstance(ind, dpt.usm_ndarray) + assert isinstance(inv_ind, dpt.usm_ndarray) + assert isinstance(uc, dpt.usm_ndarray) + assert uv.sycl_queue == q + assert ind.sycl_queue == q + assert inv_ind.sycl_queue == q + assert uc.sycl_queue == q + + +def test_gh_1738(): + get_queue_or_skip() + + ones = dpt.ones(10, dtype="i8") + iota = dpt.arange(10, dtype="i8") + + assert ones.device == iota.device + + dpt_info = dpt.__array_namespace_info__() + ind_dt = dpt_info.default_dtypes(device=ones.device)["indexing"] + + dt = dpt.unique_inverse(ones).inverse_indices.dtype + assert dt == ind_dt + dt = dpt.unique_all(ones).inverse_indices.dtype + assert dt == ind_dt + + dt = dpt.unique_inverse(iota).inverse_indices.dtype + assert dt == ind_dt + dt = dpt.unique_all(iota).inverse_indices.dtype + assert dt == ind_dt diff --git a/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py b/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py new file mode 100644 index 000000000000..b6d6293ade73 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py @@ -0,0 +1,199 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from random import randrange + +import numpy as np +import pytest +from numpy.testing import assert_array_equal, assert_equal + +import dpnp.tensor as dpt +from dpnp.tensor._numpy_helper import AxisError + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)]) +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_boolean_reduction_dtypes_contig(func, identity, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.full(10, identity, dtype=dtype, sycl_queue=q) + res = func(x) + + assert_equal(dpt.asnumpy(res), identity) + + x[randrange(x.size)] = not identity + res = func(x) + assert_equal(dpt.asnumpy(res), not identity) + + # test branch in kernel for large arrays + wg_size = 4 * 32 + x = dpt.full((wg_size + 1), identity, dtype=dtype, sycl_queue=q) + res = func(x) + assert_equal(dpt.asnumpy(res), identity) + + x[randrange(x.size)] = not identity + res = func(x) + assert_equal(dpt.asnumpy(res), not identity) + + +@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)]) +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_boolean_reduction_dtypes_strided(func, identity, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.full(20, identity, dtype=dtype, sycl_queue=q)[::-2] + res = func(x) + assert_equal(dpt.asnumpy(res), identity) + + x[randrange(x.size)] = not identity + res = func(x) + assert_equal(dpt.asnumpy(res), not identity) + + +@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)]) +def test_boolean_reduction_axis(func, identity): + get_queue_or_skip() + + x = dpt.full((2, 3, 4, 5, 6), identity, dtype="i4") + res = func(x, axis=(1, 2, -1)) + + assert res.shape == (2, 5) + assert_array_equal(dpt.asnumpy(res), np.full(res.shape, identity)) + + # make first row of output negation of identity + x[0, 0, 0, ...] 
= not identity + res = func(x, axis=(1, 2, -1)) + assert_array_equal(dpt.asnumpy(res[0]), np.full(res.shape[1], not identity)) + + +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_boolean_reduction_keepdims(func): + get_queue_or_skip() + + x = dpt.ones((2, 3, 4, 5, 6), dtype="i4") + res = func(x, axis=(1, 2, -1), keepdims=True) + assert res.shape == (2, 1, 1, 5, 1) + assert_array_equal(dpt.asnumpy(res), np.full(res.shape, True)) + + res = func(x, axis=None, keepdims=True) + assert res.shape == (1,) * x.ndim + + +@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)]) +def test_boolean_reduction_empty(func, identity): + get_queue_or_skip() + + x = dpt.empty((0,), dtype="i4") + res = func(x) + assert_equal(dpt.asnumpy(res), identity) + + +# nan, inf, and -inf should evaluate to true +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_boolean_reductions_nan_inf(func): + q = get_queue_or_skip() + + x = dpt.asarray([dpt.nan, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q)[ + :, dpt.newaxis + ] + res = func(x, axis=1) + assert_array_equal(dpt.asnumpy(res), np.array([True, True, True])) + + +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_boolean_reduction_scalars(func): + get_queue_or_skip() + + x = dpt.ones((), dtype="i4") + assert_equal(dpt.asnumpy(func(x)), True) + + x = dpt.zeros((), dtype="i4") + assert_equal(dpt.asnumpy(func(x)), False) + + +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_boolean_reduction_empty_axis(func): + get_queue_or_skip() + + x = dpt.ones((5,), dtype="i4") + res = func(x, axis=()) + assert_array_equal(dpt.asnumpy(res), dpt.asnumpy(x).astype(np.bool_)) + + +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_arg_validation_boolean_reductions(func): + get_queue_or_skip() + + x = dpt.ones((4, 5), dtype="i4") + d = {} + + with pytest.raises(TypeError): + func(d) + with pytest.raises(AxisError): + func(x, axis=-3) + + +def test_boolean_reductions_3d_gh_1327(): + get_queue_or_skip() + + size = 24 + x = dpt.reshape(dpt.arange(-10, size - 10, 1, dtype="i4"), (2, 3, 4)) + res = dpt.all(x, axis=0) + res_np = np.full(res.shape, True, dtype="?") + res_np[2, 2] = False + + assert (dpt.asnumpy(res) == res_np).all() + + x = dpt.ones((2, 3, 4, 5), dtype="i4") + res = dpt.any(x, axis=0) + + assert (dpt.asnumpy(res) == np.full(res.shape, True, dtype="?")).all() diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py index 0e2fe7dc5a04..9b1f0cc20108 100644 --- a/dpnp/tests/test_array_api_info.py +++ b/dpnp/tests/test_array_api_info.py @@ -1,10 +1,9 @@ -import numpy import pytest from dpctl import get_devices, select_default_device -from dpctl.tensor._tensor_impl import default_device_complex_type import dpnp from dpnp.exceptions import SyclDeviceCreationError +from dpnp.tensor._tensor_impl import default_device_complex_type from dpnp.tests.helper import ( has_support_aspect64, is_win_platform, diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index d8a80ddbff78..b195c0484105 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -2,7 +2,6 @@ from math import prod import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -14,6 +13,7 @@ ) import dpnp +import dpnp.tensor as dpt from .helper import ( assert_dtype_allclose, diff --git a/dpnp/tests/test_arraymanipulation.py b/dpnp/tests/test_arraymanipulation.py index fe74368a8c81..25c454b97613 100644 --- 
a/dpnp/tests/test_arraymanipulation.py +++ b/dpnp/tests/test_arraymanipulation.py @@ -1,11 +1,9 @@ -import warnings - -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import assert_array_equal, assert_equal, assert_raises import dpnp +import dpnp.tensor as dpt from dpnp.exceptions import AxisError from .helper import get_all_dtypes, get_float_complex_dtypes diff --git a/dpnp/tests/test_cli_options.py b/dpnp/tests/test_cli_options.py new file mode 100644 index 000000000000..0caca95f3974 --- /dev/null +++ b/dpnp/tests/test_cli_options.py @@ -0,0 +1,20 @@ +import subprocess +import sys + + +def test_tensor_includes(): + res = subprocess.run( + [sys.executable, "-m", "dpnp", "--tensor-includes"], + capture_output=True, + ) + assert res.returncode == 0 + assert res.stdout + flags = res.stdout.decode("utf-8") + res = subprocess.run( + [sys.executable, "-m", "dpnp", "--tensor-include-dir"], + capture_output=True, + ) + assert res.returncode == 0 + assert res.stdout + dir = res.stdout.decode("utf-8") + assert flags == "-I " + dir diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py index b10bf1b46016..f8cc95a7a3ca 100644 --- a/dpnp/tests/test_fft.py +++ b/dpnp/tests/test_fft.py @@ -1,10 +1,10 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import assert_raises import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_utils import map_dtype_to_device from dpnp.exceptions import ExecutionPlacementError diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py index 27f34f6288b3..d54ae381a386 100644 --- a/dpnp/tests/test_indexing.py +++ b/dpnp/tests/test_indexing.py @@ -1,10 +1,8 @@ import functools import dpctl -import dpctl.tensor as dpt import numpy import pytest -from dpctl.tensor._type_utils import _to_device_supported_dtype from numpy.testing import ( assert_, assert_array_equal, @@ -14,8 +12,10 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from dpnp.exceptions import AxisError, ExecutionPlacementError +from dpnp.tensor._type_utils import _to_device_supported_dtype from .helper import ( generate_random_numpy_array, diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py index 20d974b32f0c..74b9122e0d20 100644 --- a/dpnp/tests/test_linalg.py +++ b/dpnp/tests/test_linalg.py @@ -1,7 +1,6 @@ import warnings import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -13,6 +12,7 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.exceptions import AxisError, ExecutionPlacementError from .helper import ( diff --git a/dpnp/tests/test_manipulation.py b/dpnp/tests/test_manipulation.py index c35050afaa86..4fc4b8cb1619 100644 --- a/dpnp/tests/test_manipulation.py +++ b/dpnp/tests/test_manipulation.py @@ -1,6 +1,5 @@ import itertools -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -10,6 +9,7 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.exceptions import AxisError from .helper import ( diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py index 511047372a14..8de7ec2ed80d 100644 --- a/dpnp/tests/test_mathematical.py +++ b/dpnp/tests/test_mathematical.py @@ -1,8 +1,6 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest -from dpctl.tensor._numpy_helper import normalize_axis_index from numpy.testing import ( assert_allclose, assert_array_equal, @@ -12,9 +10,11 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from 
dpnp.dpnp_utils import map_dtype_to_device from dpnp.exceptions import AxisError, ExecutionPlacementError +from dpnp.tensor._numpy_helper import normalize_axis_index from .helper import ( assert_dtype_allclose, diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py index 6a3d6ac5afae..5f5a9251fd75 100644 --- a/dpnp/tests/test_memory.py +++ b/dpnp/tests/test_memory.py @@ -1,9 +1,9 @@ -import dpctl.tensor as dpt import numpy import pytest import dpnp import dpnp.memory as dpm +import dpnp.tensor as dpt class IntUsmData(dpt.usm_ndarray): diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py index 48520015d354..598d1c2678ec 100644 --- a/dpnp/tests/test_nanfunctions.py +++ b/dpnp/tests/test_nanfunctions.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -12,6 +11,7 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.exceptions import ExecutionPlacementError from .helper import ( diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py index 6ce8645a11d4..5a848c9660fc 100644 --- a/dpnp/tests/test_ndarray.py +++ b/dpnp/tests/test_ndarray.py @@ -1,4 +1,3 @@ -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -10,6 +9,7 @@ ) import dpnp +import dpnp.tensor as dpt from .helper import ( generate_random_numpy_array, @@ -567,6 +567,9 @@ def test_print_dpnp_special_character(character): assert result == expected +# TODO: repr formatting is inconsistent (scientific vs integer-like output) +# This is a minor issue that does not depend on compiler flags +@pytest.mark.skip(reason="SAT-8452") def test_print_dpnp_1d(): dtype = dpnp.default_float_type() result = repr(dpnp.arange(10000, dtype=dtype)) diff --git a/dpnp/tests/test_search.py b/dpnp/tests/test_search.py index 64c4eb75f906..75ce9bdeed20 100644 --- a/dpnp/tests/test_search.py +++ b/dpnp/tests/test_search.py @@ -1,9 +1,9 @@ -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import assert_array_equal, assert_equal, assert_raises import dpnp +import dpnp.tensor as dpt from .helper import ( generate_random_numpy_array, diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py index cf436087b607..a02adfac2ecb 100644 --- a/dpnp/tests/test_statistics.py +++ b/dpnp/tests/test_statistics.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -10,6 +9,7 @@ ) import dpnp +import dpnp.tensor as dpt from .helper import ( assert_dtype_allclose, diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py index e4b9403df8a4..5420285d5940 100644 --- a/dpnp/tests/test_sycl_queue.py +++ b/dpnp/tests/test_sycl_queue.py @@ -2,13 +2,13 @@ import tempfile import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import assert_array_equal, assert_raises import dpnp import dpnp.linalg +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations from dpnp.exceptions import ExecutionPlacementError @@ -50,7 +50,7 @@ def assert_sycl_queue_equal(result, expected): assert result.sycl_device == expected.sycl_device assert result.is_in_order == expected.is_in_order assert result.has_enable_profiling == expected.has_enable_profiling - exec_queue = dpctl.utils.get_execution_queue([result, expected]) + exec_queue = dpt.get_execution_queue([result, expected]) assert exec_queue is not None diff --git a/dpnp/tests/test_usm_type.py 
b/dpnp/tests/test_usm_type.py index b73eb67d51ee..568cf2a2aff0 100644 --- a/dpnp/tests/test_usm_type.py +++ b/dpnp/tests/test_usm_type.py @@ -2,12 +2,11 @@ import tempfile from math import prod -import dpctl.tensor as dpt -import dpctl.utils as du import numpy import pytest import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_utils import get_usm_allocations from .helper import generate_random_numpy_array @@ -29,7 +28,7 @@ def test_add(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -46,7 +45,7 @@ def test_multiply(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -63,7 +62,7 @@ def test_subtract(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -80,7 +79,7 @@ def test_divide(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -100,7 +99,7 @@ def test_remainder(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -121,7 +120,7 @@ def test_floor_divide(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -136,7 +135,7 @@ def test_power(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize( @@ -320,7 +319,7 @@ def test_linspace_arrays(usm_type_start, usm_type_stop): start = dpnp.array([0, 0], usm_type=usm_type_start) stop = dpnp.array([2, 4], usm_type=usm_type_stop) res = dpnp.linspace(start, stop, 4) - assert res.usm_type == du.get_coerced_usm_type( + assert res.usm_type == dpt.get_coerced_usm_type( [usm_type_start, usm_type_stop] ) @@ -376,7 +375,7 @@ def test_logic_op_2in(op, usm_type_x, usm_type_y): assert x.usm_type == zx.usm_type == usm_type_x assert y.usm_type == zy.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("op", ["bitwise_count", "bitwise_not"]) @@ -404,7 +403,7 @@ def test_bitwise_op_2in(op, usm_type_x, usm_type_y): assert x.usm_type == zx.usm_type == usm_type_x assert y.usm_type 
== zy.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) class TestMatmul: @@ -445,7 +444,7 @@ def test_basic(self, usm_type_x, usm_type_y, dtype, shape1, shape2): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type", list_of_usm_types) def test_syrk(self, usm_type): @@ -474,7 +473,7 @@ def test_matvec(usm_type_x, usm_type_y, shape1, shape2): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -496,7 +495,7 @@ def test_vecdot(usm_type_x, usm_type_y, shape1, shape2): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -518,7 +517,7 @@ def test_vecmat(usm_type_x, usm_type_y, shape1, shape2): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -744,7 +743,7 @@ def test_2in_1out(func, data1, data2, usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize( @@ -765,7 +764,7 @@ def test_2in_2out(func, data1, data2, usm_type_x, usm_type_y): assert ( z1.usm_type == z2.usm_type - == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) ) @@ -811,7 +810,7 @@ def test_piecewise(usm_type_x, usm_type_y, usm_type_z): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y assert z.usm_type == usm_type_z - assert result.usm_type == du.get_coerced_usm_type( + assert result.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_y, usm_type_z] ) @@ -836,7 +835,7 @@ def test_concat_stack(func, data1, data2, usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -848,7 +847,7 @@ def test_extract(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize( @@ -896,7 +895,9 @@ def test_obj_ndarray(self, usm_type, usm_type_other): assert x.usm_type == usm_type assert y.usm_type == usm_type_other - assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other]) + assert z.usm_type == dpt.get_coerced_usm_type( + [usm_type, usm_type_other] + ) @pytest.mark.parametrize("usm_type", list_of_usm_types) @@ -941,7 +942,9 @@ def test_values_ndarray(self, obj, 
usm_type, usm_type_other): assert x.usm_type == usm_type assert y.usm_type == usm_type_other - assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other]) + assert z.usm_type == dpt.get_coerced_usm_type( + [usm_type, usm_type_other] + ) @pytest.mark.parametrize("values", [-2, [-1, -2]], ids=["scalar", "list"]) @pytest.mark.parametrize("usm_type_other", list_of_usm_types) @@ -952,7 +955,9 @@ def test_obj_ndarray(self, values, usm_type, usm_type_other): assert x.usm_type == usm_type assert y.usm_type == usm_type_other - assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other]) + assert z.usm_type == dpt.get_coerced_usm_type( + [usm_type, usm_type_other] + ) @pytest.mark.parametrize("usm_type_y", list_of_usm_types) @pytest.mark.parametrize("usm_type_z", list_of_usm_types) @@ -965,7 +970,7 @@ def test_obj_values_ndarray(self, usm_type, usm_type_y, usm_type_z): assert x.usm_type == usm_type assert y.usm_type == usm_type_y assert z.usm_type == usm_type_z - assert res.usm_type == du.get_coerced_usm_type( + assert res.usm_type == dpt.get_coerced_usm_type( [usm_type, usm_type_y, usm_type_z] ) @@ -980,7 +985,7 @@ def test_take(func, usm_type_x, usm_type_ind): assert x.usm_type == usm_type_x assert ind.usm_type == usm_type_ind - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind]) @pytest.mark.parametrize( @@ -1004,7 +1009,7 @@ def test_take_along_axis(data, ind, axis, usm_type_x, usm_type_ind): assert x.usm_type == usm_type_x assert ind.usm_type == usm_type_ind - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind]) @pytest.mark.parametrize("usm_type", list_of_usm_types + [None]) @@ -1156,8 +1161,8 @@ def test_histogram(usm_type_v, usm_type_w): hist, edges = dpnp.histogram(v, weights=w) assert v.usm_type == usm_type_v assert w.usm_type == usm_type_w - assert hist.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) - assert edges.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert edges.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -1172,13 +1177,13 @@ def test_histogram2d(usm_type_x, usm_type_y, usm_type_w): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y assert w.usm_type == usm_type_w - assert hist.usm_type == du.get_coerced_usm_type( + assert hist.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_y, usm_type_w] ) - assert edges_x.usm_type == du.get_coerced_usm_type( + assert edges_x.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_y, usm_type_w] ) - assert edges_y.usm_type == du.get_coerced_usm_type( + assert edges_y.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_y, usm_type_w] ) @@ -1192,7 +1197,7 @@ def test_bincount(usm_type_v, usm_type_w): hist = dpnp.bincount(v, weights=w) assert v.usm_type == usm_type_v assert w.usm_type == usm_type_w - assert hist.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) @pytest.mark.parametrize("usm_type_v", list_of_usm_types) @@ -1204,9 +1209,9 @@ def test_histogramdd(usm_type_v, usm_type_w): hist, edges = dpnp.histogramdd(v, weights=w) assert v.usm_type == usm_type_v assert w.usm_type == usm_type_w - assert hist.usm_type == 
du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) for e in edges: - assert e.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert e.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) @pytest.mark.parametrize( @@ -1247,7 +1252,7 @@ def test_histogram_bin_edges(usm_type_v, usm_type_w): edges = dpnp.histogram_bin_edges(v, weights=w) assert v.usm_type == usm_type_v assert w.usm_type == usm_type_w - assert edges.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert edges.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -1256,7 +1261,7 @@ def test_select(usm_type_x, usm_type_y): condlist = [dpnp.array([True, False], usm_type=usm_type_x)] choicelist = [dpnp.array([1, 2], usm_type=usm_type_y)] res = dpnp.select(condlist, choicelist) - assert res.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert res.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("axis", [None, 0, -1]) @@ -1300,7 +1305,7 @@ def test_ediff1d(usm_type_x, usm_type_args, to_end, to_begin): res = dpnp.ediff1d(x, to_end=to_end, to_begin=to_begin) - assert res.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_args]) + assert res.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_args]) @pytest.mark.parametrize("usm_type", list_of_usm_types) @@ -1337,7 +1342,7 @@ def test_choose(usm_type_x, usm_type_ind): assert chc.usm_type == usm_type_x assert ind.usm_type == usm_type_ind - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind]) @pytest.mark.parametrize( @@ -1371,7 +1376,7 @@ def test_basic(self, usm_type_x, usm_type_xp, usm_type_fp): assert x.usm_type == usm_type_x assert xp.usm_type == usm_type_xp assert fp.usm_type == usm_type_fp - assert result.usm_type == du.get_coerced_usm_type( + assert result.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_xp, usm_type_fp] ) @@ -1390,7 +1395,7 @@ def test_left_right(self, usm_type_x, usm_type_left, usm_type_right): assert left.usm_type == usm_type_left assert right.usm_type == usm_type_right - assert result.usm_type == du.get_coerced_usm_type( + assert result.usm_type == dpt.get_coerced_usm_type( [ x.usm_type, xp.usm_type, @@ -1523,7 +1528,7 @@ def test_lstsq(self, m, n, nrhs, usm_type, usm_type_other): assert a.usm_type == usm_type assert b.usm_type == usm_type_other for param in result: - assert param.usm_type == du.get_coerced_usm_type( + assert param.usm_type == dpt.get_coerced_usm_type( [usm_type, usm_type_other] ) @@ -1570,7 +1575,7 @@ def test_lu_solve(self, a_data, b_data, usm_type, usm_type_rhs): assert lu.usm_type == usm_type assert b.usm_type == usm_type_rhs - assert result.usm_type == du.get_coerced_usm_type( + assert result.usm_type == dpt.get_coerced_usm_type( [usm_type, usm_type_rhs] ) @@ -1730,7 +1735,7 @@ def test_solve(self, matrix, rhs, usm_type, usm_type_rhs): assert x.usm_type == usm_type assert y.usm_type == usm_type_rhs - assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_rhs]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type, usm_type_rhs]) @pytest.mark.parametrize("full_matrices_param", [True, False]) @pytest.mark.parametrize("compute_uv_param", [True, False]) @@ -1796,6 +1801,6 @@ def test_tensorsolve(self, usm_type, usm_type_other): assert a.usm_type == usm_type assert 
b.usm_type == usm_type_other
-        assert result.usm_type == du.get_coerced_usm_type(
+        assert result.usm_type == dpt.get_coerced_usm_type(
             [usm_type, usm_type_other]
         )
diff --git a/dpnp/tests/test_utils.py b/dpnp/tests/test_utils.py
index eef9132e5b55..aef6abba8726 100644
--- a/dpnp/tests/test_utils.py
+++ b/dpnp/tests/test_utils.py
@@ -1,8 +1,8 @@
-import dpctl.tensor as dpt
 import numpy
 import pytest
 
 import dpnp
+import dpnp.tensor as dpt
 
 
 class TestIsSupportedArrayOrScalar:
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
index 41df0a82e0a0..eb9e958fad0b 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import dpctl
-import dpctl.tensor._dlpack as dlp
 import numpy
 import pytest
 
 import dpnp as cupy
+import dpnp.tensor._dlpack as dlp
 from dpnp.tests.third_party.cupy import testing
diff --git a/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py b/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
index 7399343e7e57..3b23b32fe3b2 100644
--- a/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
+++ b/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
@@ -84,7 +84,7 @@ def test_put(self, xp, dtype):
         # Take care so that actual indices don't overlap.
         if self.mode == "raise":
             pytest.skip("'raise' mode is not supported")
-        # `wrap` mode in dpctl.tensor.put is different from numpy.put (#1365):
+        # `wrap` mode in dpnp.tensor.put is different from numpy.put (#1365):
         # numpy's `wrap` mode wraps indices around for cyclic operations
         # while dpctl's `wrap` mode restricts indices to stay within the array bounds (-n <= i < n).
diff --git a/pyproject.toml b/pyproject.toml
index 78ebe9d9aa66..02567d2f25ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314']
 [tool.codespell]
 builtin = "clear,rare,informal,names"
 check-filenames = true
-ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart"
+ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT,inpT,wit"
 quiet-level = 3
 
 [tool.coverage.report]
@@ -134,13 +134,21 @@ source = [
 ensure_newline_before_comments = true
 force_grid_wrap = 0
 include_trailing_comma = true
+known_third_party = ["dpctl"]
 line_length = 80
 multi_line_output = 3
+profile = "black"
 skip = ["dpnp/__init__.py"]
 split_on_trailing_comma = true
 use_parentheses = true
 
 [tool.pylint.basic]
+disable = [
+    "wrong-import-order",
+    "ungrouped-imports",
+    "wrong-import-position"
+]
+ignored-modules = ["dpctl", "dpctl.*"]
 include-naming-hint = true
 
 [tool.pylint.classes]
diff --git a/setup.py b/setup.py
index cc21221299c4..3f5449663508 100644
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,7 @@
     cmdclass=versioneer.get_cmdclass(),
     packages=[
         "dpnp",
+        "dpnp.tensor",
         "dpnp.dpnp_algo",
         "dpnp.dpnp_utils",
         "dpnp.exceptions",
@@ -52,12 +53,14 @@
             "dpnp_backend_c.lib",
             "dpnp_backend_c.dll",
             "tests/*.*",
+            "tests/tensor/*.py",
+            "tests/tensor/*/*.py",
             "tests/testing/*.py",
             "tests/third_party/cupy/*.py",
             "tests/third_party/cupy/*/*.py",
             "tests/third_party/cupyx/*.py",
             "tests/third_party/cupyx/*/*.py",
-        ]
+        ],
     },
     include_package_data=False,
 )